914 files changed, 26715 insertions, 19234 deletions
diff --git a/arch/alpha/boot/Makefile b/arch/alpha/boot/Makefile
index cd143887380a..8399bd0e68e8 100644
--- a/arch/alpha/boot/Makefile
+++ b/arch/alpha/boot/Makefile
@@ -14,6 +14,9 @@ targets		:= vmlinux.gz vmlinux \
 		   tools/bootpzh bootloader bootpheader bootpzheader 
 OBJSTRIP	:= $(obj)/tools/objstrip
 
+HOSTCFLAGS	:= -Wall -I$(objtree)/usr/include
+BOOTCFLAGS	+= -I$(obj) -I$(srctree)/$(obj)
+
 # SRM bootable image.  Copy to offset 512 of a partition.
 $(obj)/bootimage: $(addprefix $(obj)/tools/,mkbb lxboot bootlx) $(obj)/vmlinux.nh
 	( cat $(obj)/tools/lxboot $(obj)/tools/bootlx $(obj)/vmlinux.nh ) > $@ 
@@ -96,13 +99,14 @@ $(obj)/tools/bootph: $(obj)/bootpheader $(OBJSTRIP) FORCE
 $(obj)/tools/bootpzh: $(obj)/bootpzheader $(OBJSTRIP) FORCE
 	$(call if_changed,objstrip)
 
-LDFLAGS_bootloader   := -static -uvsprintf -T  #-N -relax
-LDFLAGS_bootpheader  := -static -uvsprintf -T  #-N -relax
-LDFLAGS_bootpzheader := -static -uvsprintf -T  #-N -relax
+LDFLAGS_bootloader   := -static -T # -N -relax
+LDFLAGS_bootloader   := -static -T # -N -relax
+LDFLAGS_bootpheader  := -static -T # -N -relax
+LDFLAGS_bootpzheader := -static -T # -N -relax
 
-OBJ_bootlx   := $(obj)/head.o $(obj)/main.o
-OBJ_bootph   := $(obj)/head.o $(obj)/bootp.o
-OBJ_bootpzh  := $(obj)/head.o $(obj)/bootpz.o $(obj)/misc.o
+OBJ_bootlx   := $(obj)/head.o $(obj)/stdio.o $(obj)/main.o
+OBJ_bootph   := $(obj)/head.o $(obj)/stdio.o $(obj)/bootp.o
+OBJ_bootpzh  := $(obj)/head.o $(obj)/stdio.o $(obj)/bootpz.o $(obj)/misc.o
 
 $(obj)/bootloader: $(obj)/bootloader.lds $(OBJ_bootlx) $(LIBS_Y) FORCE
 	$(call if_changed,ld)
diff --git a/arch/alpha/boot/main.c b/arch/alpha/boot/main.c
index 3baf2d1e908d..dd6eb4a33582 100644
--- a/arch/alpha/boot/main.c
+++ b/arch/alpha/boot/main.c
@@ -19,7 +19,6 @@
 
 #include "ksize.h"
 
-extern int vsprintf(char *, const char *, va_list);
 extern unsigned long switch_to_osf_pal(unsigned long nr,
 	struct pcb_struct * pcb_va, struct pcb_struct * pcb_pa,
 	unsigned long *vptb);
diff --git a/arch/alpha/boot/stdio.c b/arch/alpha/boot/stdio.c
new file mode 100644
index 000000000000..f844dae8a54a
--- /dev/null
+++ b/arch/alpha/boot/stdio.c
@@ -0,0 +1,306 @@
+/*
+ * Copyright (C) Paul Mackerras 1997.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <stdarg.h>
+#include <stddef.h>
+
+size_t strnlen(const char * s, size_t count)
+{
+	const char *sc;
+
+	for (sc = s; count-- && *sc != '\0'; ++sc)
+		/* nothing */;
+	return sc - s;
+}
+
+# define do_div(n, base) ({						\
+	unsigned int __base = (base);					\
+	unsigned int __rem;						\
+	__rem = ((unsigned long long)(n)) % __base;			\
+	(n) = ((unsigned long long)(n)) / __base;			\
+	__rem;								\
+})
+
+
+static int skip_atoi(const char **s)
+{
+	int i, c;
+
+	for (i = 0; '0' <= (c = **s) && c <= '9'; ++*s)
+		i = i*10 + c - '0';
+	return i;
+}
+
+#define ZEROPAD	1		/* pad with zero */
+#define SIGN	2		/* unsigned/signed long */
+#define PLUS	4		/* show plus */
+#define SPACE	8		/* space if plus */
+#define LEFT	16		/* left justified */
+#define SPECIAL	32		/* 0x */
+#define LARGE	64		/* use 'ABCDEF' instead of 'abcdef' */
+
+static char * number(char * str, unsigned long long num, int base, int size, int precision, int type)
+{
+	char c,sign,tmp[66];
+	const char *digits="0123456789abcdefghijklmnopqrstuvwxyz";
+	int i;
+
+	if (type & LARGE)
+		digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+	if (type & LEFT)
+		type &= ~ZEROPAD;
+	if (base < 2 || base > 36)
+		return 0;
+	c = (type & ZEROPAD) ? '0' : ' ';
+	sign = 0;
+	if (type & SIGN) {
+		if ((signed long long)num < 0) {
+			sign = '-';
+			num = - (signed long long)num;
+			size--;
+		} else if (type & PLUS) {
+			sign = '+';
+			size--;
+		} else if (type & SPACE) {
+			sign = ' ';
+			size--;
+		}
+	}
+	if (type & SPECIAL) {
+		if (base == 16)
+			size -= 2;
+		else if (base == 8)
+			size--;
+	}
+	i = 0;
+	if (num == 0)
+		tmp[i++]='0';
+	else while (num != 0) {
+		tmp[i++] = digits[do_div(num, base)];
+	}
+	if (i > precision)
+		precision = i;
+	size -= precision;
+	if (!(type&(ZEROPAD+LEFT)))
+		while(size-->0)
+			*str++ = ' ';
+	if (sign)
+		*str++ = sign;
+	if (type & SPECIAL) {
+		if (base==8)
+			*str++ = '0';
+		else if (base==16) {
+			*str++ = '0';
+			*str++ = digits[33];
+		}
+	}
+	if (!(type & LEFT))
+		while (size-- > 0)
+			*str++ = c;
+	while (i < precision--)
+		*str++ = '0';
+	while (i-- > 0)
+		*str++ = tmp[i];
+	while (size-- > 0)
+		*str++ = ' ';
+	return str;
+}
+
+int vsprintf(char *buf, const char *fmt, va_list args)
+{
+	int len;
+	unsigned long long num;
+	int i, base;
+	char * str;
+	const char *s;
+
+	int flags;		/* flags to number() */
+
+	int field_width;	/* width of output field */
+	int precision;		/* min. # of digits for integers; max
+				   number of chars for from string */
+	int qualifier;		/* 'h', 'l', or 'L' for integer fields */
+	                        /* 'z' support added 23/7/1999 S.H.    */
+				/* 'z' changed to 'Z' --davidm 1/25/99 */
+
+
+	for (str=buf ; *fmt ; ++fmt) {
+		if (*fmt != '%') {
+			*str++ = *fmt;
+			continue;
+		}
+
+		/* process flags */
+		flags = 0;
+		repeat:
+			++fmt;		/* this also skips first '%' */
+			switch (*fmt) {
+				case '-': flags |= LEFT; goto repeat;
+				case '+': flags |= PLUS; goto repeat;
+				case ' ': flags |= SPACE; goto repeat;
+				case '#': flags |= SPECIAL; goto repeat;
+				case '0': flags |= ZEROPAD; goto repeat;
+				}
+
+		/* get field width */
+		field_width = -1;
+		if ('0' <= *fmt && *fmt <= '9')
+			field_width = skip_atoi(&fmt);
+		else if (*fmt == '*') {
+			++fmt;
+			/* it's the next argument */
+			field_width = va_arg(args, int);
+			if (field_width < 0) {
+				field_width = -field_width;
+				flags |= LEFT;
+			}
+		}
+
+		/* get the precision */
+		precision = -1;
+		if (*fmt == '.') {
+			++fmt;
+			if ('0' <= *fmt && *fmt <= '9')
+				precision = skip_atoi(&fmt);
+			else if (*fmt == '*') {
+				++fmt;
+				/* it's the next argument */
+				precision = va_arg(args, int);
+			}
+			if (precision < 0)
+				precision = 0;
+		}
+
+		/* get the conversion qualifier */
+		qualifier = -1;
+		if (*fmt == 'l' && *(fmt + 1) == 'l') {
+			qualifier = 'q';
+			fmt += 2;
+		} else if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L'
+			|| *fmt == 'Z') {
+			qualifier = *fmt;
+			++fmt;
+		}
+
+		/* default base */
+		base = 10;
+
+		switch (*fmt) {
+		case 'c':
+			if (!(flags & LEFT))
+				while (--field_width > 0)
+					*str++ = ' ';
+			*str++ = (unsigned char) va_arg(args, int);
+			while (--field_width > 0)
+				*str++ = ' ';
+			continue;
+
+		case 's':
+			s = va_arg(args, char *);
+			if (!s)
+				s = "<NULL>";
+
+			len = strnlen(s, precision);
+
+			if (!(flags & LEFT))
+				while (len < field_width--)
+					*str++ = ' ';
+			for (i = 0; i < len; ++i)
+				*str++ = *s++;
+			while (len < field_width--)
+				*str++ = ' ';
+			continue;
+
+		case 'p':
+			if (field_width == -1) {
+				field_width = 2*sizeof(void *);
+				flags |= ZEROPAD;
+			}
+			str = number(str,
+				(unsigned long) va_arg(args, void *), 16,
+				field_width, precision, flags);
+			continue;
+
+
+		case 'n':
+			if (qualifier == 'l') {
+				long * ip = va_arg(args, long *);
+				*ip = (str - buf);
+			} else if (qualifier == 'Z') {
+				size_t * ip = va_arg(args, size_t *);
+				*ip = (str - buf);
+			} else {
+				int * ip = va_arg(args, int *);
+				*ip = (str - buf);
+			}
+			continue;
+
+		case '%':
+			*str++ = '%';
+			continue;
+
+		/* integer number formats - set up the flags and "break" */
+		case 'o':
+			base = 8;
+			break;
+
+		case 'X':
+			flags |= LARGE;
+		case 'x':
+			base = 16;
+			break;
+
+		case 'd':
+		case 'i':
+			flags |= SIGN;
+		case 'u':
+			break;
+
+		default:
+			*str++ = '%';
+			if (*fmt)
+				*str++ = *fmt;
+			else
+				--fmt;
+			continue;
+		}
+		if (qualifier == 'l') {
+			num = va_arg(args, unsigned long);
+			if (flags & SIGN)
+				num = (signed long) num;
+		} else if (qualifier == 'q') {
+			num = va_arg(args, unsigned long long);
+			if (flags & SIGN)
+				num = (signed long long) num;
+		} else if (qualifier == 'Z') {
+			num = va_arg(args, size_t);
+		} else if (qualifier == 'h') {
+			num = (unsigned short) va_arg(args, int);
+			if (flags & SIGN)
+				num = (signed short) num;
+		} else {
+			num = va_arg(args, unsigned int);
+			if (flags & SIGN)
+				num = (signed int) num;
+		}
+		str = number(str, num, base, field_width, precision, flags);
+	}
+	*str = '\0';
+	return str-buf;
+}
+
+int sprintf(char * buf, const char *fmt, ...)
+{
+	va_list args;
+	int i;
+
+	va_start(args, fmt);
+	i=vsprintf(buf,fmt,args);
+	va_end(args);
+	return i;
+}
diff --git a/arch/alpha/boot/tools/objstrip.c b/arch/alpha/boot/tools/objstrip.c
index 367d53d031fc..dee82695f48b 100644
--- a/arch/alpha/boot/tools/objstrip.c
+++ b/arch/alpha/boot/tools/objstrip.c
@@ -27,6 +27,9 @@
 #include <linux/param.h>
 #ifdef __ELF__
 # include <linux/elf.h>
+# define elfhdr elf64_hdr
+# define elf_phdr elf64_phdr
+# define elf_check_arch(x) ((x)->e_machine == EM_ALPHA)
 #endif
 
 /* bootfile size must be multiple of BLOCK_SIZE: */
diff --git a/arch/alpha/include/asm/cmpxchg.h b/arch/alpha/include/asm/cmpxchg.h
index 429e8cd0d78e..e5117766529e 100644
--- a/arch/alpha/include/asm/cmpxchg.h
+++ b/arch/alpha/include/asm/cmpxchg.h
@@ -66,6 +66,4 @@
 #undef __ASM__MB
 #undef ____cmpxchg
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 #endif /* _ALPHA_CMPXCHG_H */
diff --git a/arch/alpha/include/asm/pci.h b/arch/alpha/include/asm/pci.h
index f7f680f7457d..8b02afeb6319 100644
--- a/arch/alpha/include/asm/pci.h
+++ b/arch/alpha/include/asm/pci.h
@@ -71,22 +71,6 @@ extern void pcibios_set_master(struct pci_dev *dev);
 /* implement the pci_ DMA API in terms of the generic device dma_ one */
 #include <asm-generic/pci-dma-compat.h>
 
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	unsigned long cacheline_size;
-	u8 byte;
-
-	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
-	if (byte == 0)
-		cacheline_size = 1024;
-	else
-		cacheline_size = (int) byte * 4;
-
-	*strat = PCI_DMA_BURST_BOUNDARY;
-	*strategy_parameter = cacheline_size;
-}
 #endif
 
 /* TODO: integrate with include/asm-generic/pci.h ? */
diff --git a/arch/alpha/include/asm/types.h b/arch/alpha/include/asm/types.h
index f61e1a56c378..4cb4b6d3452c 100644
--- a/arch/alpha/include/asm/types.h
+++ b/arch/alpha/include/asm/types.h
@@ -2,6 +2,5 @@
 #define _ALPHA_TYPES_H
 
 #include <asm-generic/int-ll64.h>
-#include <uapi/asm/types.h>
 
 #endif /* _ALPHA_TYPES_H */
diff --git a/arch/alpha/include/asm/unistd.h b/arch/alpha/include/asm/unistd.h
index c509d306db45..a56e608db2f9 100644
--- a/arch/alpha/include/asm/unistd.h
+++ b/arch/alpha/include/asm/unistd.h
@@ -3,7 +3,7 @@
 
 #include <uapi/asm/unistd.h>
 
-#define NR_SYSCALLS			511
+#define NR_SYSCALLS			514
 
 #define __ARCH_WANT_OLD_READDIR
 #define __ARCH_WANT_STAT64
diff --git a/arch/alpha/include/uapi/asm/unistd.h b/arch/alpha/include/uapi/asm/unistd.h
index d214a0358100..aa33bf5aacb6 100644
--- a/arch/alpha/include/uapi/asm/unistd.h
+++ b/arch/alpha/include/uapi/asm/unistd.h
@@ -472,5 +472,8 @@
 #define __NR_sched_setattr		508
 #define __NR_sched_getattr		509
 #define __NR_renameat2			510
+#define __NR_getrandom			511
+#define __NR_memfd_create		512
+#define __NR_execveat			513
 
 #endif /* _UAPI_ALPHA_UNISTD_H */
diff --git a/arch/alpha/kernel/core_irongate.c b/arch/alpha/kernel/core_irongate.c
index 00096df0f6ad..83d0a359a1b2 100644
--- a/arch/alpha/kernel/core_irongate.c
+++ b/arch/alpha/kernel/core_irongate.c
@@ -22,7 +22,6 @@
 #include <linux/bootmem.h>
 
 #include <asm/ptrace.h>
-#include <asm/pci.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
diff --git a/arch/alpha/kernel/err_ev6.c b/arch/alpha/kernel/err_ev6.c
index 253cf1a87481..51267ac5729b 100644
--- a/arch/alpha/kernel/err_ev6.c
+++ b/arch/alpha/kernel/err_ev6.c
@@ -6,7 +6,6 @@
  *	Error handling code supporting Alpha systems
  */
 
-#include <linux/init.h>
 #include <linux/sched.h>
 
 #include <asm/io.h>
diff --git a/arch/alpha/kernel/irq.c b/arch/alpha/kernel/irq.c
index 7b2be251c30f..51f2c8654253 100644
--- a/arch/alpha/kernel/irq.c
+++ b/arch/alpha/kernel/irq.c
@@ -19,7 +19,6 @@
 #include <linux/ptrace.h>
 #include <linux/interrupt.h>
 #include <linux/random.h>
-#include <linux/init.h>
 #include <linux/irq.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index e51f578636a5..36dc91ace83a 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -1019,14 +1019,13 @@ SYSCALL_DEFINE2(osf_settimeofday, struct timeval32 __user *, tv,
  	if (tv) {
 		if (get_tv32((struct timeval *)&kts, tv))
 			return -EFAULT;
+		kts.tv_nsec *= 1000;
 	}
 	if (tz) {
 		if (copy_from_user(&ktz, tz, sizeof(*tz)))
 			return -EFAULT;
 	}
 
-	kts.tv_nsec *= 1000;
-
 	return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
 }
 
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 1941a07b5811..84d13263ce46 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -236,12 +236,11 @@ release_thread(struct task_struct *dead_task)
 }
 
 /*
- * Copy an alpha thread..
+ * Copy architecture-specific thread state
  */
-
 int
 copy_thread(unsigned long clone_flags, unsigned long usp,
-	    unsigned long arg,
+	    unsigned long kthread_arg,
 	    struct task_struct *p)
 {
 	extern void ret_from_fork(void);
@@ -262,7 +261,7 @@ copy_thread(unsigned long clone_flags, unsigned long usp,
 			sizeof(struct switch_stack) + sizeof(struct pt_regs));
 		childstack->r26 = (unsigned long) ret_from_kernel_thread;
 		childstack->r9 = usp;	/* function */
-		childstack->r10 = arg;
+		childstack->r10 = kthread_arg;
 		childregs->hae = alpha_mv.hae_cache,
 		childti->pcb.usp = 0;
 		return 0;
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 99ac36d5de4e..2f24447fef92 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -63,7 +63,6 @@ static struct {
 enum ipi_message_type {
 	IPI_RESCHEDULE,
 	IPI_CALL_FUNC,
-	IPI_CALL_FUNC_SINGLE,
 	IPI_CPU_STOP,
 };
 
@@ -506,7 +505,6 @@ setup_profiling_timer(unsigned int multiplier)
 	return -EINVAL;
 }
 
-
 static void
 send_ipi_message(const struct cpumask *to_whom, enum ipi_message_type operation)
 {
@@ -552,10 +550,6 @@ handle_ipi(struct pt_regs *regs)
 			generic_smp_call_function_interrupt();
 			break;
 
-		case IPI_CALL_FUNC_SINGLE:
-			generic_smp_call_function_single_interrupt();
-			break;
-
 		case IPI_CPU_STOP:
 			halt();
 
@@ -606,7 +600,7 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 
 void arch_send_call_function_single_ipi(int cpu)
 {
-	send_ipi_message(cpumask_of(cpu), IPI_CALL_FUNC_SINGLE);
+	send_ipi_message(cpumask_of(cpu), IPI_CALL_FUNC);
 }
 
 static void
diff --git a/arch/alpha/kernel/srmcons.c b/arch/alpha/kernel/srmcons.c
index 6f01d9ad7b81..72b59511e59a 100644
--- a/arch/alpha/kernel/srmcons.c
+++ b/arch/alpha/kernel/srmcons.c
@@ -237,8 +237,7 @@ srmcons_init(void)
 
 	return -ENODEV;
 }
-
-module_init(srmcons_init);
+device_initcall(srmcons_init);
 
 
 /*
diff --git a/arch/alpha/kernel/sys_eiger.c b/arch/alpha/kernel/sys_eiger.c
index 79d69d7f63f8..15f42083bdb3 100644
--- a/arch/alpha/kernel/sys_eiger.c
+++ b/arch/alpha/kernel/sys_eiger.c
@@ -22,7 +22,6 @@
 #include <asm/irq.h>
 #include <asm/mmu_context.h>
 #include <asm/io.h>
-#include <asm/pci.h>
 #include <asm/pgtable.h>
 #include <asm/core_tsunami.h>
 #include <asm/hwrpb.h>
diff --git a/arch/alpha/kernel/sys_marvel.c b/arch/alpha/kernel/sys_marvel.c
index f21d61fab678..24e41bd7d3c9 100644
--- a/arch/alpha/kernel/sys_marvel.c
+++ b/arch/alpha/kernel/sys_marvel.c
@@ -331,7 +331,7 @@ marvel_map_irq(const struct pci_dev *cdev, u8 slot, u8 pin)
 	pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &intline);
 	irq = intline;
 
-	msi_loc = pci_find_capability(dev, PCI_CAP_ID_MSI);
+	msi_loc = dev->msi_cap;
 	msg_ctl = 0;
 	if (msi_loc) 
 		pci_read_config_word(dev, msi_loc + PCI_MSI_FLAGS, &msg_ctl);
diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c
index 700686d04869..2cfaa0e5c577 100644
--- a/arch/alpha/kernel/sys_nautilus.c
+++ b/arch/alpha/kernel/sys_nautilus.c
@@ -39,7 +39,6 @@
 #include <asm/irq.h>
 #include <asm/mmu_context.h>
 #include <asm/io.h>
-#include <asm/pci.h>
 #include <asm/pgtable.h>
 #include <asm/core_irongate.h>
 #include <asm/hwrpb.h>
diff --git a/arch/alpha/kernel/systbls.S b/arch/alpha/kernel/systbls.S
index 24789713f1ea..9b62e3fd4f03 100644
--- a/arch/alpha/kernel/systbls.S
+++ b/arch/alpha/kernel/systbls.S
@@ -529,6 +529,9 @@ sys_call_table:
 	.quad sys_sched_setattr
 	.quad sys_sched_getattr
 	.quad sys_renameat2			/* 510 */
+	.quad sys_getrandom
+	.quad sys_memfd_create
+	.quad sys_execveat
 
 	.size sys_call_table, . - sys_call_table
 	.type sys_call_table, @object
diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index 9c4c189eb22f..74aceead06e9 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -14,7 +14,6 @@
 #include <linux/tty.h>
 #include <linux/delay.h>
 #include <linux/module.h>
-#include <linux/init.h>
 #include <linux/kallsyms.h>
 #include <linux/ratelimit.h>
 
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index 9d0ac091a52a..4a905bd667e2 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -23,8 +23,7 @@
 #include <linux/smp.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
-
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 extern void die_if_kernel(char *,struct pt_regs *,long, unsigned long *);
 
@@ -107,7 +106,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
 
 	/* If we're in an interrupt context, or have no user context,
 	   we must not take the fault.  */
-	if (!mm || in_atomic())
+	if (!mm || faulthandler_disabled())
 		goto no_context;
 
 #ifdef CONFIG_ALPHA_LARGE_VMALLOC
diff --git a/arch/alpha/oprofile/op_model_ev4.c b/arch/alpha/oprofile/op_model_ev4.c
index 18aa9b4f94f1..086a0d5445c5 100644
--- a/arch/alpha/oprofile/op_model_ev4.c
+++ b/arch/alpha/oprofile/op_model_ev4.c
@@ -8,7 +8,6 @@
  */
 
 #include <linux/oprofile.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 #include <asm/ptrace.h>
 
diff --git a/arch/alpha/oprofile/op_model_ev5.c b/arch/alpha/oprofile/op_model_ev5.c
index c32f8a0ad925..c300f5ef3482 100644
--- a/arch/alpha/oprofile/op_model_ev5.c
+++ b/arch/alpha/oprofile/op_model_ev5.c
@@ -8,7 +8,6 @@
  */
 
 #include <linux/oprofile.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 #include <asm/ptrace.h>
 
diff --git a/arch/alpha/oprofile/op_model_ev6.c b/arch/alpha/oprofile/op_model_ev6.c
index 1c84cc257fc7..02edf5971614 100644
--- a/arch/alpha/oprofile/op_model_ev6.c
+++ b/arch/alpha/oprofile/op_model_ev6.c
@@ -8,7 +8,6 @@
  */
 
 #include <linux/oprofile.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 #include <asm/ptrace.h>
 
diff --git a/arch/alpha/oprofile/op_model_ev67.c b/arch/alpha/oprofile/op_model_ev67.c
index 34a57a126553..adb1744d20f3 100644
--- a/arch/alpha/oprofile/op_model_ev67.c
+++ b/arch/alpha/oprofile/op_model_ev67.c
@@ -9,7 +9,6 @@
  */
 
 #include <linux/oprofile.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 #include <asm/ptrace.h>
 
diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h
index 4dc64ddebece..05b5aaf5b0f9 100644
--- a/arch/arc/include/asm/futex.h
+++ b/arch/arc/include/asm/futex.h
@@ -53,7 +53,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	pagefault_disable();	/* implies preempt_disable() */
+	pagefault_disable();
 
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -75,7 +75,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 		ret = -ENOSYS;
 	}
 
-	pagefault_enable();	/* subsumes preempt_enable() */
+	pagefault_enable();
 
 	if (!ret) {
 		switch (cmp) {
@@ -104,7 +104,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
 	return ret;
 }
 
-/* Compare-xchg with preemption disabled.
+/* Compare-xchg with pagefaults disabled.
  *  Notes:
  *      -Best-Effort: Exchg happens only if compare succeeds.
  *          If compare fails, returns; leaving retry/looping to upper layers
@@ -121,7 +121,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
 		return -EFAULT;
 
-	pagefault_disable();	/* implies preempt_disable() */
+	pagefault_disable();
 
 	/* TBD : can use llock/scond */
 	__asm__ __volatile__(
@@ -142,7 +142,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval,
 	: "r"(oldval), "r"(newval), "r"(uaddr), "ir"(-EFAULT)
 	: "cc", "memory");
 
-	pagefault_enable();	/* subsumes preempt_enable() */
+	pagefault_enable();
 
 	*uval = val;
 	return val;
diff --git a/arch/arc/include/asm/io.h b/arch/arc/include/asm/io.h
index cabd518cb253..7cc4ced5dbf4 100644
--- a/arch/arc/include/asm/io.h
+++ b/arch/arc/include/asm/io.h
@@ -20,6 +20,7 @@ extern void iounmap(const void __iomem *addr);
 
 #define ioremap_nocache(phy, sz)	ioremap(phy, sz)
 #define ioremap_wc(phy, sz)		ioremap(phy, sz)
+#define ioremap_wt(phy, sz)		ioremap(phy, sz)
 
 /* Change struct page to physical address */
 #define page_to_phys(page)		(page_to_pfn(page) << PAGE_SHIFT)
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index 6a2e006cbcce..d948e4e9d89c 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -86,7 +86,7 @@ void do_page_fault(unsigned long address, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 45df48ba0b12..325d6f3a596a 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -15,6 +15,8 @@ config ARM
 	select CLONE_BACKWARDS
 	select CPU_PM if (SUSPEND || CPU_IDLE)
 	select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS
+	select EDAC_SUPPORT
+	select EDAC_ATOMIC_SCRUB
 	select GENERIC_ALLOCATOR
 	select GENERIC_ATOMIC64 if (CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI)
 	select GENERIC_CLOCKEVENTS_BROADCAST if SMP
diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index 86217db2937a..992736b5229b 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -223,7 +223,7 @@ dtb-$(CONFIG_SOC_IMX25) += \
 	imx25-eukrea-mbimxsd25-baseboard-dvi-vga.dtb \
 	imx25-karo-tx25.dtb \
 	imx25-pdk.dtb
-dtb-$(CONFIG_SOC_IMX31) += \
+dtb-$(CONFIG_SOC_IMX27) += \
 	imx27-apf27.dtb \
 	imx27-apf27dev.dtb \
 	imx27-eukrea-mbimxsd27-baseboard.dtb \
diff --git a/arch/arm/boot/dts/am335x-bone-common.dtsi b/arch/arm/boot/dts/am335x-bone-common.dtsi
index c3255e0c90aa..dbb3f4d2bf84 100644
--- a/arch/arm/boot/dts/am335x-bone-common.dtsi
+++ b/arch/arm/boot/dts/am335x-bone-common.dtsi
@@ -223,6 +223,25 @@
 /include/ "tps65217.dtsi"
 
 &tps {
+	/*
+	 * Configure pmic to enter OFF-state instead of SLEEP-state ("RTC-only
+	 * mode") at poweroff.  Most BeagleBone versions do not support RTC-only
+	 * mode and risk hardware damage if this mode is entered.
+	 *
+	 * For details, see linux-omap mailing list May 2015 thread
+	 *	[PATCH] ARM: dts: am335x-bone* enable pmic-shutdown-controller
+	 * In particular, messages:
+	 *	http://www.spinics.net/lists/linux-omap/msg118585.html
+	 *	http://www.spinics.net/lists/linux-omap/msg118615.html
+	 *
+	 * You can override this later with
+	 *	&tps {  /delete-property/ ti,pmic-shutdown-controller;  }
+	 * if you want to use RTC-only mode and made sure you are not affected
+	 * by the hardware problems. (Tip: double-check by performing a current
+	 * measurement after shutdown: it should be less than 1 mA.)
+	 */
+	ti,pmic-shutdown-controller;
+
 	regulators {
 		dcdc1_reg: regulator@0 {
 			regulator-name = "vdds_dpr";
diff --git a/arch/arm/boot/dts/am335x-boneblack.dts b/arch/arm/boot/dts/am335x-boneblack.dts
index 5c42d259fa68..901739fcb85a 100644
--- a/arch/arm/boot/dts/am335x-boneblack.dts
+++ b/arch/arm/boot/dts/am335x-boneblack.dts
@@ -80,7 +80,3 @@
 		status = "okay";
 	};
 };
-
-&rtc {
-	system-power-controller;
-};
diff --git a/arch/arm/boot/dts/am335x-evmsk.dts b/arch/arm/boot/dts/am335x-evmsk.dts
index 87fc7a35e802..156d05efcb70 100644
--- a/arch/arm/boot/dts/am335x-evmsk.dts
+++ b/arch/arm/boot/dts/am335x-evmsk.dts
@@ -654,7 +654,7 @@
 	wlcore: wlcore@2 {
 		compatible = "ti,wl1271";
 		reg = <2>;
-		interrupt-parent = <&gpio1>;
+		interrupt-parent = <&gpio0>;
 		interrupts = <31 IRQ_TYPE_LEVEL_HIGH>; /* gpio 31 */
 		ref-clock-frequency = <38400000>;
 	};
diff --git a/arch/arm/boot/dts/am35xx-clocks.dtsi b/arch/arm/boot/dts/am35xx-clocks.dtsi
index 518b8fde88b0..18cc826e9db5 100644
--- a/arch/arm/boot/dts/am35xx-clocks.dtsi
+++ b/arch/arm/boot/dts/am35xx-clocks.dtsi
@@ -12,7 +12,7 @@
 		#clock-cells = <0>;
 		compatible = "ti,am35xx-gate-clock";
 		clocks = <&ipss_ick>;
-		reg = <0x059c>;
+		reg = <0x032c>;
 		ti,bit-shift = <1>;
 	};
 
@@ -20,7 +20,7 @@
 		#clock-cells = <0>;
 		compatible = "ti,gate-clock";
 		clocks = <&rmii_ck>;
-		reg = <0x059c>;
+		reg = <0x032c>;
 		ti,bit-shift = <9>;
 	};
 
@@ -28,7 +28,7 @@
 		#clock-cells = <0>;
 		compatible = "ti,am35xx-gate-clock";
 		clocks = <&ipss_ick>;
-		reg = <0x059c>;
+		reg = <0x032c>;
 		ti,bit-shift = <2>;
 	};
 
@@ -36,7 +36,7 @@
 		#clock-cells = <0>;
 		compatible = "ti,gate-clock";
 		clocks = <&pclk_ck>;
-		reg = <0x059c>;
+		reg = <0x032c>;
 		ti,bit-shift = <10>;
 	};
 
@@ -44,7 +44,7 @@
 		#clock-cells = <0>;
 		compatible = "ti,am35xx-gate-clock";
 		clocks = <&ipss_ick>;
-		reg = <0x059c>;
+		reg = <0x032c>;
 		ti,bit-shift = <0>;
 	};
 
@@ -52,7 +52,7 @@
 		#clock-cells = <0>;
 		compatible = "ti,gate-clock";
 		clocks = <&sys_ck>;
-		reg = <0x059c>;
+		reg = <0x032c>;
 		ti,bit-shift = <8>;
 	};
 
@@ -60,7 +60,7 @@
 		#clock-cells = <0>;
 		compatible = "ti,am35xx-gate-clock";
 		clocks = <&sys_ck>;
-		reg = <0x059c>;
+		reg = <0x032c>;
 		ti,bit-shift = <3>;
 	};
 };
diff --git a/arch/arm/boot/dts/am57xx-beagle-x15.dts b/arch/arm/boot/dts/am57xx-beagle-x15.dts
index 7128fad991ac..a42cc377a862 100644
--- a/arch/arm/boot/dts/am57xx-beagle-x15.dts
+++ b/arch/arm/boot/dts/am57xx-beagle-x15.dts
@@ -19,6 +19,7 @@
 		rtc0 = &mcp_rtc;
 		rtc1 = &tps659038_rtc;
 		rtc2 = &rtc;
+		display0 = &hdmi0;
 	};
 
 	memory {
@@ -103,6 +104,51 @@
 		pinctrl-names = "default";
 		pinctrl-0 = <&extcon_usb2_pins>;
 	};
+
+	hdmi0: connector {
+		compatible = "hdmi-connector";
+		label = "hdmi";
+
+		type = "a";
+
+		port {
+			hdmi_connector_in: endpoint {
+				remote-endpoint = <&tpd12s015_out>;
+			};
+		};
+	};
+
+	tpd12s015: encoder {
+		compatible = "ti,tpd12s015";
+
+		pinctrl-names = "default";
+		pinctrl-0 = <&tpd12s015_pins>;
+
+		gpios = <&gpio7 10 GPIO_ACTIVE_HIGH>,	/* gpio7_10, CT CP HPD */
+			<&gpio6 28 GPIO_ACTIVE_HIGH>,	/* gpio6_28, LS OE */
+			<&gpio7 12 GPIO_ACTIVE_HIGH>;	/* gpio7_12/sp1_cs2, HPD */
+
+		ports {
+			#address-cells = <1>;
+			#size-cells = <0>;
+
+			port@0 {
+				reg = <0>;
+
+				tpd12s015_in: endpoint {
+					remote-endpoint = <&hdmi_out>;
+				};
+			};
+
+			port@1 {
+				reg = <1>;
+
+				tpd12s015_out: endpoint {
+					remote-endpoint = <&hdmi_connector_in>;
+				};
+			};
+		};
+	};
 };
 
 &dra7_pmx_core {
@@ -122,6 +168,13 @@
 		>;
 	};
 
+	hdmi_pins: pinmux_hdmi_pins {
+		pinctrl-single,pins = <
+			0x408 (PIN_INPUT | MUX_MODE1)	/* i2c2_sda.hdmi1_ddc_scl */
+			0x40c (PIN_INPUT | MUX_MODE1)	/* i2c2_scl.hdmi1_ddc_sda */
+		>;
+	};
+
 	i2c3_pins_default: i2c3_pins_default {
 		pinctrl-single,pins = <
 			0x2a4 (PIN_INPUT| MUX_MODE10)	/* mcasp1_aclkx.i2c3_sda */
@@ -278,6 +331,14 @@
 			0x3e8 (PIN_INPUT_PULLUP | MUX_MODE14) /* uart1_ctsn.gpio7_24 */
 		>;
 	};
+
+	tpd12s015_pins: pinmux_tpd12s015_pins {
+		pinctrl-single,pins = <
+			0x3b0 (PIN_OUTPUT | MUX_MODE14)		/* gpio7_10 CT_CP_HPD */
+			0x3b8 (PIN_INPUT_PULLDOWN | MUX_MODE14)	/* gpio7_12 HPD */
+			0x370 (PIN_OUTPUT | MUX_MODE14)		/* gpio6_28 LS_OE */
+		>;
+	};
 };
 
 &i2c1 {
@@ -608,3 +669,23 @@
 		};
        };
 };
+
+&dss {
+	status = "ok";
+
+	vdda_video-supply = <&ldoln_reg>;
+};
+
+&hdmi {
+	status = "ok";
+	vdda-supply = <&ldo3_reg>;
+
+	pinctrl-names = "default";
+	pinctrl-0 = <&hdmi_pins>;
+
+	port {
+		hdmi_out: endpoint {
+			remote-endpoint = <&tpd12s015_in>;
+		};
+	};
+};
diff --git a/arch/arm/boot/dts/armada-xp-linksys-mamba.dts b/arch/arm/boot/dts/armada-xp-linksys-mamba.dts
index a2cf2154dcdb..fdd187c55aa5 100644
--- a/arch/arm/boot/dts/armada-xp-linksys-mamba.dts
+++ b/arch/arm/boot/dts/armada-xp-linksys-mamba.dts
@@ -95,6 +95,11 @@
 
 		internal-regs {
 
+			rtc@10300 {
+				/* No crystal connected to the internal RTC */
+				status = "disabled";
+			};
+
 			/* J10: VCC, NC, RX, NC, TX, GND  */
 			serial@12000 {
 				status = "okay";
diff --git a/arch/arm/boot/dts/dm816x.dtsi b/arch/arm/boot/dts/dm816x.dtsi
index de8427be830a..289806adb343 100644
--- a/arch/arm/boot/dts/dm816x.dtsi
+++ b/arch/arm/boot/dts/dm816x.dtsi
@@ -382,7 +382,7 @@
 			ti,hwmods = "usb_otg_hs";
 
 			usb0: usb@47401000 {
-				compatible = "ti,musb-am33xx";
+				compatible = "ti,musb-dm816";
 				reg = <0x47401400 0x400
 				       0x47401000 0x200>;
 				reg-names = "mc", "control";
@@ -422,7 +422,7 @@
 			};
 
 			usb1: usb@47401800 {
-				compatible = "ti,musb-am33xx";
+				compatible = "ti,musb-dm816";
 				reg = <0x47401c00 0x400
 				       0x47401800 0x200>;
 				reg-names = "mc", "control";
diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi
index f03a091cd076..8f1e25bcecbd 100644
--- a/arch/arm/boot/dts/dra7.dtsi
+++ b/arch/arm/boot/dts/dra7.dtsi
@@ -131,6 +131,11 @@
 							regulator-max-microvolt = <3000000>;
 						};
 					};
+
+					scm_conf_clocks: clocks {
+						#address-cells = <1>;
+						#size-cells = <0>;
+					};
 				};
 
 				dra7_pmx_core: pinmux@1400 {
@@ -1469,6 +1474,44 @@
 			clocks = <&sys_clkin1>;
 			status = "disabled";
 		};
+
+		dss: dss@58000000 {
+			compatible = "ti,dra7-dss";
+			/* 'reg' defined in dra72x.dtsi and dra74x.dtsi */
+			/* 'clocks' defined in dra72x.dtsi and dra74x.dtsi */
+			status = "disabled";
+			ti,hwmods = "dss_core";
+			/* CTRL_CORE_DSS_PLL_CONTROL */
+			syscon-pll-ctrl = <&scm_conf 0x538>;
+			#address-cells = <1>;
+			#size-cells = <1>;
+			ranges;
+
+			dispc@58001000 {
+				compatible = "ti,dra7-dispc";
+				reg = <0x58001000 0x1000>;
+				interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>;
+				ti,hwmods = "dss_dispc";
+				clocks = <&dss_dss_clk>;
+				clock-names = "fck";
+				/* CTRL_CORE_SMA_SW_1 */
+				syscon-pol = <&scm_conf 0x534>;
+			};
+
+			hdmi: encoder@58060000 {
+				compatible = "ti,dra7-hdmi";
+				reg = <0x58040000 0x200>,
+				      <0x58040200 0x80>,
+				      <0x58040300 0x80>,
+				      <0x58060000 0x19000>;
+				reg-names = "wp", "pll", "phy", "core";
+				interrupts = <GIC_SPI 96 IRQ_TYPE_LEVEL_HIGH>;
+				status = "disabled";
+				ti,hwmods = "dss_hdmi";
+				clocks = <&dss_48mhz_clk>, <&dss_hdmi_clk>;
+				clock-names = "fck", "sys_clk";
+			};
+		};
 	};
 
 	thermal_zones: thermal-zones {
diff --git a/arch/arm/boot/dts/dra72-evm.dts b/arch/arm/boot/dts/dra72-evm.dts
index ce0390f081d9..4e1b60581782 100644
--- a/arch/arm/boot/dts/dra72-evm.dts
+++ b/arch/arm/boot/dts/dra72-evm.dts
@@ -19,6 +19,10 @@
 		reg = <0x80000000 0x40000000>; /* 1024 MB */
 	};
 
+	aliases {
+		display0 = &hdmi0;
+	};
+
 	evm_3v3: fixedregulator-evm_3v3 {
 		compatible = "regulator-fixed";
 		regulator-name = "evm_3v3";
@@ -35,6 +39,51 @@
 		compatible = "linux,extcon-usb-gpio";
 		id-gpio = <&pcf_gpio_21 2 GPIO_ACTIVE_HIGH>;
 	};
+
+	hdmi0: connector {
+		compatible = "hdmi-connector";
+		label = "hdmi";
+
+		type = "a";
+
+		port {
+			hdmi_connector_in: endpoint {
+				remote-endpoint = <&tpd12s015_out>;
+			};
+		};
+	};
+
+	tpd12s015: encoder {
+		compatible = "ti,tpd12s015";
+
+		pinctrl-names = "default";
+		pinctrl-0 = <&tpd12s015_pins>;
+
+		gpios = <&pcf_hdmi 4 GPIO_ACTIVE_HIGH>,	/* P4, CT CP HPD */
+			<&pcf_hdmi 5 GPIO_ACTIVE_HIGH>,	/* P5, LS OE */
+			<&gpio7 12 GPIO_ACTIVE_HIGH>;	/* gpio7_12/sp1_cs2, HPD */
+
+		ports {
+			#address-cells = <1>;
+			#size-cells = <0>;
+
+			port@0 {
+				reg = <0>;
+
+				tpd12s015_in: endpoint {
+					remote-endpoint = <&hdmi_out>;
+				};
+			};
+
+			port@1 {
+				reg = <1>;
+
+				tpd12s015_out: endpoint {
+					remote-endpoint = <&hdmi_connector_in>;
+				};
+			};
+		};
+	};
 };
 
 &dra7_pmx_core {
@@ -45,6 +94,13 @@
 		>;
 	};
 
+	i2c5_pins: pinmux_i2c5_pins {
+		pinctrl-single,pins = <
+			0x2b4 (PIN_INPUT | MUX_MODE10) /* mcasp1_axr0.i2c5_sda */
+			0x2b8 (PIN_INPUT | MUX_MODE10) /* mcasp1_axr1.i2c5_scl */
+		>;
+	};
+
 	nand_default: nand_default {
 		pinctrl-single,pins = <
 			0x0	(PIN_INPUT  | MUX_MODE0) /* gpmc_ad0 */
@@ -142,6 +198,19 @@
 			0xb8 (PIN_OUTPUT | MUX_MODE1)	/* gpmc_cs2.qspi1_cs0 */
 		>;
 	};
+
+	hdmi_pins: pinmux_hdmi_pins {
+		pinctrl-single,pins = <
+			0x408 (PIN_INPUT | MUX_MODE1) /* i2c2_sda.hdmi1_ddc_scl */
+			0x40c (PIN_INPUT | MUX_MODE1) /* i2c2_scl.hdmi1_ddc_sda */
+		>;
+	};
+
+	tpd12s015_pins: pinmux_tpd12s015_pins {
+		pinctrl-single,pins = <
+			0x3b8 (PIN_INPUT_PULLDOWN | MUX_MODE14) /* gpio7_12 HPD */
+		>;
+	};
 };
 
 &i2c1 {
@@ -277,6 +346,27 @@
 	};
 };
 
+&i2c5 {
+	status = "okay";
+	pinctrl-names = "default";
+	pinctrl-0 = <&i2c5_pins>;
+	clock-frequency = <400000>;
+
+	pcf_hdmi: pcf8575@26 {
+		compatible = "nxp,pcf8575";
+		reg = <0x26>;
+		gpio-controller;
+		#gpio-cells = <2>;
+		/*
+		 * initial state is used here to keep the mdio interface
+		 * selected on RU89 through SEL_VIN4_MUX_S0, VIN2_S1 and
+		 * VIN2_S0 driven high otherwise Ethernet stops working
+		 * VIN6_SEL_S0 is low, thus selecting McASP3 over VIN6
+		 */
+		lines-initial-states = <0x0f2b>;
+	};
+};
+
 &uart1 {
 	status = "okay";
 };
@@ -566,3 +656,23 @@
 		};
 	};
 };
+
+&dss {
+	status = "ok";
+
+	vdda_video-supply = <&ldo5_reg>;
+};
+
+&hdmi {
+	status = "ok";
+	vdda-supply = <&ldo3_reg>;
+
+	pinctrl-names = "default";
+	pinctrl-0 = <&hdmi_pins>;
+
+	port {
+		hdmi_out: endpoint {
+			remote-endpoint = <&tpd12s015_in>;
+		};
+	};
+};
diff --git a/arch/arm/boot/dts/dra72x.dtsi b/arch/arm/boot/dts/dra72x.dtsi
index 03d742f8d572..eaca143faa77 100644
--- a/arch/arm/boot/dts/dra72x.dtsi
+++ b/arch/arm/boot/dts/dra72x.dtsi
@@ -34,3 +34,14 @@
 		interrupts = <GIC_SPI 131 IRQ_TYPE_LEVEL_HIGH>;
 	};
 };
+
+&dss {
+	reg = <0x58000000 0x80>,
+	      <0x58004054 0x4>,
+	      <0x58004300 0x20>;
+	reg-names = "dss", "pll1_clkctrl", "pll1";
+
+	clocks = <&dss_dss_clk>,
+		 <&dss_video1_clk>;
+	clock-names = "fck", "video1_clk";
+};
diff --git a/arch/arm/boot/dts/dra74x.dtsi b/arch/arm/boot/dts/dra74x.dtsi
index cc560a70926f..fa995d0ca1f2 100644
--- a/arch/arm/boot/dts/dra74x.dtsi
+++ b/arch/arm/boot/dts/dra74x.dtsi
@@ -73,3 +73,18 @@
 		};
 	};
 };
+
+&dss {
+	reg = <0x58000000 0x80>,
+	      <0x58004054 0x4>,
+	      <0x58004300 0x20>,
+	      <0x58005054 0x4>,
+	      <0x58005300 0x20>;
+	reg-names = "dss", "pll1_clkctrl", "pll1",
+		    "pll2_clkctrl", "pll2";
+
+	clocks = <&dss_dss_clk>,
+		 <&dss_video1_clk>,
+		 <&dss_video2_clk>;
+	clock-names = "fck", "video1_clk", "video2_clk";
+};
diff --git a/arch/arm/boot/dts/dra7xx-clocks.dtsi b/arch/arm/boot/dts/dra7xx-clocks.dtsi
index 3b933f74d000..357bedeebfac 100644
--- a/arch/arm/boot/dts/dra7xx-clocks.dtsi
+++ b/arch/arm/boot/dts/dra7xx-clocks.dtsi
@@ -1531,6 +1531,7 @@
 		clocks = <&dpll_per_h12x2_ck>;
 		ti,bit-shift = <8>;
 		reg = <0x1120>;
+		ti,set-rate-parent;
 	};
 
 	dss_hdmi_clk: dss_hdmi_clk {
@@ -2136,3 +2137,13 @@
 		clocks = <&dpll_usb_ck>;
 	};
 };
+
+&scm_conf_clocks {
+	dss_deshdcp_clk: dss_deshdcp_clk {
+		#clock-cells = <0>;
+		compatible = "ti,gate-clock";
+		clocks = <&l3_iclk_div>;
+		ti,bit-shift = <0>;
+		reg = <0x558>;
+	};
+};
diff --git a/arch/arm/boot/dts/exynos4412-trats2.dts b/arch/arm/boot/dts/exynos4412-trats2.dts
index 173ffa479ad3..792394dd0f2a 100644
--- a/arch/arm/boot/dts/exynos4412-trats2.dts
+++ b/arch/arm/boot/dts/exynos4412-trats2.dts
@@ -736,7 +736,7 @@
 
 			display-timings {
 				timing-0 {
-					clock-frequency = <0>;
+					clock-frequency = <57153600>;
 					hactive = <720>;
 					vactive = <1280>;
 					hfront-porch = <5>;
diff --git a/arch/arm/boot/dts/imx27.dtsi b/arch/arm/boot/dts/imx27.dtsi
index 6951b66d1ab7..bc215e4b75fd 100644
--- a/arch/arm/boot/dts/imx27.dtsi
+++ b/arch/arm/boot/dts/imx27.dtsi
@@ -533,7 +533,7 @@
 
 			fec: ethernet@1002b000 {
 				compatible = "fsl,imx27-fec";
-				reg = <0x1002b000 0x4000>;
+				reg = <0x1002b000 0x1000>;
 				interrupts = <50>;
 				clocks = <&clks IMX27_CLK_FEC_IPG_GATE>,
 					 <&clks IMX27_CLK_FEC_AHB_GATE>;
diff --git a/arch/arm/boot/dts/imx28-cfa10036.dts b/arch/arm/boot/dts/imx28-cfa10036.dts
index b04b6b8850a7..570aa339a05e 100644
--- a/arch/arm/boot/dts/imx28-cfa10036.dts
+++ b/arch/arm/boot/dts/imx28-cfa10036.dts
@@ -99,6 +99,9 @@
 					solomon,height = <32>;
 					solomon,width = <128>;
 					solomon,page-offset = <0>;
+					solomon,com-lrremap;
+					solomon,com-invdir;
+					solomon,com-offset = <32>;
 				};
 			};
 
diff --git a/arch/arm/boot/dts/omap3-devkit8000.dts b/arch/arm/boot/dts/omap3-devkit8000.dts
index 134d3f27a8ec..921de6605f07 100644
--- a/arch/arm/boot/dts/omap3-devkit8000.dts
+++ b/arch/arm/boot/dts/omap3-devkit8000.dts
@@ -110,6 +110,8 @@
 	nand@0,0 {
 		reg = <0 0 4>; /* CS0, offset 0, IO size 4 */
 		nand-bus-width = <16>;
+		gpmc,device-width = <2>;
+		ti,nand-ecc-opt = "sw";
 
 		gpmc,sync-clk-ps = <0>;
 		gpmc,cs-on-ns = <0>;
diff --git a/arch/arm/boot/dts/omap3-n900.dts b/arch/arm/boot/dts/omap3-n900.dts
index 5c16145920ea..5f5e0f3d5b64 100644
--- a/arch/arm/boot/dts/omap3-n900.dts
+++ b/arch/arm/boot/dts/omap3-n900.dts
@@ -832,8 +832,8 @@
 		touchscreen-fuzz-x = <4>;
 		touchscreen-fuzz-y = <7>;
 		touchscreen-fuzz-pressure = <2>;
-		touchscreen-max-x = <4096>;
-		touchscreen-max-y = <4096>;
+		touchscreen-size-x = <4096>;
+		touchscreen-size-y = <4096>;
 		touchscreen-max-pressure = <2048>;
 
 		ti,x-plate-ohms = <280>;
diff --git a/arch/arm/boot/dts/socfpga_arria10.dtsi b/arch/arm/boot/dts/socfpga_arria10.dtsi
index 8a05c47fd57f..4be75960a603 100644
--- a/arch/arm/boot/dts/socfpga_arria10.dtsi
+++ b/arch/arm/boot/dts/socfpga_arria10.dtsi
@@ -253,6 +253,17 @@
 			status = "disabled";
 		};
 
+		sdr: sdr@ffc25000 {
+			compatible = "syscon";
+			reg = <0xffcfb100 0x80>;
+		};
+
+		sdramedac {
+			compatible = "altr,sdram-edac-a10";
+			altr,sdr-syscon = <&sdr>;
+			interrupts = <0 2 4>, <0 0 4>;
+		};
+
 		L2: l2-cache@fffff000 {
 			compatible = "arm,pl310-cache";
 			reg = <0xfffff000 0x1000>;
diff --git a/arch/arm/boot/dts/zynq-7000.dtsi b/arch/arm/boot/dts/zynq-7000.dtsi
index a5cd2eda3edf..9ea54b3dba09 100644
--- a/arch/arm/boot/dts/zynq-7000.dtsi
+++ b/arch/arm/boot/dts/zynq-7000.dtsi
@@ -193,7 +193,7 @@
 		};
 
 		gem0: ethernet@e000b000 {
-			compatible = "cdns,gem";
+			compatible = "cdns,zynq-gem";
 			reg = <0xe000b000 0x1000>;
 			status = "disabled";
 			interrupts = <0 22 4>;
@@ -204,7 +204,7 @@
 		};
 
 		gem1: ethernet@e000c000 {
-			compatible = "cdns,gem";
+			compatible = "cdns,zynq-gem";
 			reg = <0xe000c000 0x1000>;
 			status = "disabled";
 			interrupts = <0 45 4>;
diff --git a/arch/arm/common/sa1111.c b/arch/arm/common/sa1111.c
index 5cc779c8e9c6..93ee70dbbdd3 100644
--- a/arch/arm/common/sa1111.c
+++ b/arch/arm/common/sa1111.c
@@ -501,8 +501,8 @@ static int sa1111_setup_irq(struct sa1111 *sachip, unsigned irq_base)
 	 * Register SA1111 interrupt
 	 */
 	irq_set_irq_type(sachip->irq, IRQ_TYPE_EDGE_RISING);
-	irq_set_handler_data(sachip->irq, sachip);
-	irq_set_chained_handler(sachip->irq, sa1111_irq_handler);
+	irq_set_chained_handler_and_data(sachip->irq, sa1111_irq_handler,
+					 sachip);
 
 	dev_info(sachip->dev, "Providing IRQ%u-%u\n",
 		sachip->irq_base, sachip->irq_base + SA1111_IRQ_NR - 1);
@@ -836,8 +836,7 @@ static void __sa1111_remove(struct sa1111 *sachip)
 	clk_unprepare(sachip->clk);
 
 	if (sachip->irq != NO_IRQ) {
-		irq_set_chained_handler(sachip->irq, NULL);
-		irq_set_handler_data(sachip->irq, NULL);
+		irq_set_chained_handler_and_data(sachip->irq, NULL, NULL);
 		irq_free_descs(sachip->irq_base, SA1111_IRQ_NR);
 
 		release_mem_region(sachip->phys + SA1111_INTC, 512);
diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig
index 0ca4a3eaf65d..fbbb1915c6a9 100644
--- a/arch/arm/configs/multi_v7_defconfig
+++ b/arch/arm/configs/multi_v7_defconfig
@@ -429,7 +429,7 @@ CONFIG_USB_EHCI_EXYNOS=y
 CONFIG_USB_EHCI_TEGRA=y
 CONFIG_USB_EHCI_HCD_STI=y
 CONFIG_USB_EHCI_HCD_PLATFORM=y
-CONFIG_USB_ISP1760_HCD=y
+CONFIG_USB_ISP1760=y
 CONFIG_USB_OHCI_HCD=y
 CONFIG_USB_OHCI_HCD_STI=y
 CONFIG_USB_OHCI_HCD_PLATFORM=y
diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index 8da2207b0072..27ed1b1cd1d7 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -53,20 +53,13 @@ config CRYPTO_SHA256_ARM
 	  SHA-256 secure hash standard (DFIPS 180-2) implemented
 	  using optimized ARM assembler and NEON, when available.
 
-config CRYPTO_SHA512_ARM_NEON
-	tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
-	depends on KERNEL_MODE_NEON
-	select CRYPTO_SHA512
+config CRYPTO_SHA512_ARM
+	tristate "SHA-384/512 digest algorithm (ARM-asm and NEON)"
 	select CRYPTO_HASH
+	depends on !CPU_V7M
 	help
 	  SHA-512 secure hash standard (DFIPS 180-2) implemented
-	  using ARM NEON instructions, when available.
-
-	  This version of SHA implements a 512 bit hash with 256 bits of
-	  security against collision attacks.
-
-	  This code also includes SHA-384, a 384 bit hash with 192 bits
-	  of security against collision attacks.
+	  using optimized ARM assembler and NEON, when available.
 
 config CRYPTO_AES_ARM
 	tristate "AES cipher algorithms (ARM-asm)"
diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
index 6ea828241fcb..fc5150702b64 100644
--- a/arch/arm/crypto/Makefile
+++ b/arch/arm/crypto/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
-obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
 
 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -30,7 +30,8 @@ sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
 sha1-arm-neon-y	:= sha1-armv7-neon.o sha1_neon_glue.o
 sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
 sha256-arm-y	:= sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
-sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
+sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
+sha512-arm-y	:= sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
 sha1-arm-ce-y	:= sha1-ce-core.o sha1-ce-glue.o
 sha2-arm-ce-y	:= sha2-ce-core.o sha2-ce-glue.o
 aes-arm-ce-y	:= aes-ce-core.o aes-ce-glue.o
@@ -45,4 +46,7 @@ $(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
 $(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
 	$(call cmd,perl)
 
-.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
+$(src)/sha512-core.S_shipped: $(src)/sha512-armv4.pl
+	$(call cmd,perl)
+
+.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S $(obj)/sha512-core.S
diff --git a/arch/arm/crypto/aes-ce-core.S b/arch/arm/crypto/aes-ce-core.S
index 8cfa468ee570..987aa632c9f0 100644
--- a/arch/arm/crypto/aes-ce-core.S
+++ b/arch/arm/crypto/aes-ce-core.S
@@ -101,15 +101,14 @@
 	\dround		q10, q11
 	blo		0f			@ AES-128: 10 rounds
 	vld1.8		{q10-q11}, [ip]!
-	beq		1f			@ AES-192: 12 rounds
 	\dround		q12, q13
+	beq		1f			@ AES-192: 12 rounds
 	vld1.8		{q12-q13}, [ip]
 	\dround		q10, q11
 0:	\fround		q12, q13, q14
 	bx		lr
 
-1:	\dround		q12, q13
-	\fround		q10, q11, q14
+1:	\fround		q10, q11, q14
 	bx		lr
 	.endm
 
@@ -122,8 +121,8 @@
 	 *   q2        : third in/output block (_3x version only)
 	 *   q8        : first round key
 	 *   q9        : secound round key
-	 *   ip        : address of 3rd round key
 	 *   q14       : final round key
+	 *   r2        : address of round key array
 	 *   r3        : number of rounds
 	 */
 	.align		6
diff --git a/arch/arm/crypto/sha512-armv4.pl b/arch/arm/crypto/sha512-armv4.pl
new file mode 100644
index 000000000000..a2b11a844357
--- /dev/null
+++ b/arch/arm/crypto/sha512-armv4.pl
@@ -0,0 +1,649 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+#
+# Permission to use under GPL terms is granted.
+# ====================================================================
+
+# SHA512 block procedure for ARMv4. September 2007.
+
+# This code is ~4.5 (four and a half) times faster than code generated
+# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+# Xscale PXA250 core].
+#
+# July 2010.
+#
+# Rescheduling for dual-issue pipeline resulted in 6% improvement on
+# Cortex A8 core and ~40 cycles per processed byte.
+
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Coxtex A8 core and ~38 cycles per byte.
+
+# March 2011.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process
+# one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+# August 2012.
+#
+# Improve NEON performance by 12% on Snapdragon S4. In absolute
+# terms it's 22.6 cycles per byte, which is disappointing result.
+# Technical writers asserted that 3-way S4 pipeline can sustain
+# multiple NEON instructions per cycle, but dual NEON issue could
+# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
+# for further details. On side note Cortex-A15 processes one byte in
+# 16 cycles.
+
+# Byte order [in]dependence. =========================================
+#
+# Originally caller was expected to maintain specific *dword* order in
+# h[0-7], namely with most significant dword at *lower* address, which
+# was reflected in below two parameters as 0 and 4. Now caller is
+# expected to maintain native byte order for whole 64-bit values.
+$hi="HI";
+$lo="LO";
+# ====================================================================
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$ctx="r0";	# parameter block
+$inp="r1";
+$len="r2";
+
+$Tlo="r3";
+$Thi="r4";
+$Alo="r5";
+$Ahi="r6";
+$Elo="r7";
+$Ehi="r8";
+$t0="r9";
+$t1="r10";
+$t2="r11";
+$t3="r12";
+############	r13 is stack pointer
+$Ktbl="r14";
+############	r15 is program counter
+
+$Aoff=8*0;
+$Boff=8*1;
+$Coff=8*2;
+$Doff=8*3;
+$Eoff=8*4;
+$Foff=8*5;
+$Goff=8*6;
+$Hoff=8*7;
+$Xoff=8*8;
+
+sub BODY_00_15() {
+my $magic = shift;
+$code.=<<___;
+	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+	mov	$t0,$Elo,lsr#14
+	str	$Tlo,[sp,#$Xoff+0]
+	mov	$t1,$Ehi,lsr#14
+	str	$Thi,[sp,#$Xoff+4]
+	eor	$t0,$t0,$Ehi,lsl#18
+	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
+	eor	$t1,$t1,$Elo,lsl#18
+	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
+	eor	$t0,$t0,$Elo,lsr#18
+	eor	$t1,$t1,$Ehi,lsr#18
+	eor	$t0,$t0,$Ehi,lsl#14
+	eor	$t1,$t1,$Elo,lsl#14
+	eor	$t0,$t0,$Ehi,lsr#9
+	eor	$t1,$t1,$Elo,lsr#9
+	eor	$t0,$t0,$Elo,lsl#23
+	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
+	adds	$Tlo,$Tlo,$t0
+	ldr	$t0,[sp,#$Foff+0]	@ f.lo
+	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
+	ldr	$t1,[sp,#$Foff+4]	@ f.hi
+	adds	$Tlo,$Tlo,$t2
+	ldr	$t2,[sp,#$Goff+0]	@ g.lo
+	adc	$Thi,$Thi,$t3		@ T += h
+	ldr	$t3,[sp,#$Goff+4]	@ g.hi
+
+	eor	$t0,$t0,$t2
+	str	$Elo,[sp,#$Eoff+0]
+	eor	$t1,$t1,$t3
+	str	$Ehi,[sp,#$Eoff+4]
+	and	$t0,$t0,$Elo
+	str	$Alo,[sp,#$Aoff+0]
+	and	$t1,$t1,$Ehi
+	str	$Ahi,[sp,#$Aoff+4]
+	eor	$t0,$t0,$t2
+	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
+	eor	$t1,$t1,$t3		@ Ch(e,f,g)
+	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi
+
+	adds	$Tlo,$Tlo,$t0
+	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
+	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
+	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
+	adds	$Tlo,$Tlo,$t2
+	and	$t0,$t2,#0xff
+	adc	$Thi,$Thi,$t3		@ T += K[i]
+	adds	$Elo,$Elo,$Tlo
+	ldr	$t2,[sp,#$Boff+0]	@ b.lo
+	adc	$Ehi,$Ehi,$Thi		@ d += T
+	teq	$t0,#$magic
+
+	ldr	$t3,[sp,#$Coff+0]	@ c.lo
+#if __ARM_ARCH__>=7
+	it	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	orreq	$Ktbl,$Ktbl,#1
+	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+	mov	$t0,$Alo,lsr#28
+	mov	$t1,$Ahi,lsr#28
+	eor	$t0,$t0,$Ahi,lsl#4
+	eor	$t1,$t1,$Alo,lsl#4
+	eor	$t0,$t0,$Ahi,lsr#2
+	eor	$t1,$t1,$Alo,lsr#2
+	eor	$t0,$t0,$Alo,lsl#30
+	eor	$t1,$t1,$Ahi,lsl#30
+	eor	$t0,$t0,$Ahi,lsr#7
+	eor	$t1,$t1,$Alo,lsr#7
+	eor	$t0,$t0,$Alo,lsl#25
+	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
+	adds	$Tlo,$Tlo,$t0
+	and	$t0,$Alo,$t2
+	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)
+
+	ldr	$t1,[sp,#$Boff+4]	@ b.hi
+	orr	$Alo,$Alo,$t2
+	ldr	$t2,[sp,#$Coff+4]	@ c.hi
+	and	$Alo,$Alo,$t3
+	and	$t3,$Ahi,$t1
+	orr	$Ahi,$Ahi,$t1
+	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
+	and	$Ahi,$Ahi,$t2
+	adds	$Alo,$Alo,$Tlo
+	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
+	sub	sp,sp,#8
+	adc	$Ahi,$Ahi,$Thi		@ h += T
+	tst	$Ktbl,#1
+	add	$Ktbl,$Ktbl,#8
+___
+}
+$code=<<___;
+#ifndef __KERNEL__
+# include "arm_arch.h"
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code	32
+#else
+.syntax unified
+# ifdef __thumb2__
+#  define adrl adr
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type	K512,%object
+.align	5
+K512:
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
+.size	K512,.-K512
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha512_block_data_order
+.skip	32-4
+#else
+.skip	32
+#endif
+
+.global	sha512_block_data_order
+.type	sha512_block_data_order,%function
+sha512_block_data_order:
+#if __ARM_ARCH__<7
+	sub	r3,pc,#8		@ sha512_block_data_order
+#else
+	adr	r3,sha512_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#1
+	bne	.LNEON
+#endif
+	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
+	stmdb	sp!,{r4-r12,lr}
+	sub	$Ktbl,r3,#672		@ K512
+	sub	sp,sp,#9*8
+
+	ldr	$Elo,[$ctx,#$Eoff+$lo]
+	ldr	$Ehi,[$ctx,#$Eoff+$hi]
+	ldr	$t0, [$ctx,#$Goff+$lo]
+	ldr	$t1, [$ctx,#$Goff+$hi]
+	ldr	$t2, [$ctx,#$Hoff+$lo]
+	ldr	$t3, [$ctx,#$Hoff+$hi]
+.Loop:
+	str	$t0, [sp,#$Goff+0]
+	str	$t1, [sp,#$Goff+4]
+	str	$t2, [sp,#$Hoff+0]
+	str	$t3, [sp,#$Hoff+4]
+	ldr	$Alo,[$ctx,#$Aoff+$lo]
+	ldr	$Ahi,[$ctx,#$Aoff+$hi]
+	ldr	$Tlo,[$ctx,#$Boff+$lo]
+	ldr	$Thi,[$ctx,#$Boff+$hi]
+	ldr	$t0, [$ctx,#$Coff+$lo]
+	ldr	$t1, [$ctx,#$Coff+$hi]
+	ldr	$t2, [$ctx,#$Doff+$lo]
+	ldr	$t3, [$ctx,#$Doff+$hi]
+	str	$Tlo,[sp,#$Boff+0]
+	str	$Thi,[sp,#$Boff+4]
+	str	$t0, [sp,#$Coff+0]
+	str	$t1, [sp,#$Coff+4]
+	str	$t2, [sp,#$Doff+0]
+	str	$t3, [sp,#$Doff+4]
+	ldr	$Tlo,[$ctx,#$Foff+$lo]
+	ldr	$Thi,[$ctx,#$Foff+$hi]
+	str	$Tlo,[sp,#$Foff+0]
+	str	$Thi,[sp,#$Foff+4]
+
+.L00_15:
+#if __ARM_ARCH__<7
+	ldrb	$Tlo,[$inp,#7]
+	ldrb	$t0, [$inp,#6]
+	ldrb	$t1, [$inp,#5]
+	ldrb	$t2, [$inp,#4]
+	ldrb	$Thi,[$inp,#3]
+	ldrb	$t3, [$inp,#2]
+	orr	$Tlo,$Tlo,$t0,lsl#8
+	ldrb	$t0, [$inp,#1]
+	orr	$Tlo,$Tlo,$t1,lsl#16
+	ldrb	$t1, [$inp],#8
+	orr	$Tlo,$Tlo,$t2,lsl#24
+	orr	$Thi,$Thi,$t3,lsl#8
+	orr	$Thi,$Thi,$t0,lsl#16
+	orr	$Thi,$Thi,$t1,lsl#24
+#else
+	ldr	$Tlo,[$inp,#4]
+	ldr	$Thi,[$inp],#8
+#ifdef __ARMEL__
+	rev	$Tlo,$Tlo
+	rev	$Thi,$Thi
+#endif
+#endif
+___
+	&BODY_00_15(0x94);
+$code.=<<___;
+	tst	$Ktbl,#1
+	beq	.L00_15
+	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
+	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
+	bic	$Ktbl,$Ktbl,#1
+.L16_79:
+	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
+	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
+	mov	$Tlo,$t0,lsr#1
+	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
+	mov	$Thi,$t1,lsr#1
+	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
+	eor	$Tlo,$Tlo,$t1,lsl#31
+	eor	$Thi,$Thi,$t0,lsl#31
+	eor	$Tlo,$Tlo,$t0,lsr#8
+	eor	$Thi,$Thi,$t1,lsr#8
+	eor	$Tlo,$Tlo,$t1,lsl#24
+	eor	$Thi,$Thi,$t0,lsl#24
+	eor	$Tlo,$Tlo,$t0,lsr#7
+	eor	$Thi,$Thi,$t1,lsr#7
+	eor	$Tlo,$Tlo,$t1,lsl#25
+
+	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+	mov	$t0,$t2,lsr#19
+	mov	$t1,$t3,lsr#19
+	eor	$t0,$t0,$t3,lsl#13
+	eor	$t1,$t1,$t2,lsl#13
+	eor	$t0,$t0,$t3,lsr#29
+	eor	$t1,$t1,$t2,lsr#29
+	eor	$t0,$t0,$t2,lsl#3
+	eor	$t1,$t1,$t3,lsl#3
+	eor	$t0,$t0,$t2,lsr#6
+	eor	$t1,$t1,$t3,lsr#6
+	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
+	eor	$t0,$t0,$t3,lsl#26
+
+	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
+	adds	$Tlo,$Tlo,$t0
+	ldr	$t0,[sp,#`$Xoff+8*16`+0]
+	adc	$Thi,$Thi,$t1
+
+	ldr	$t1,[sp,#`$Xoff+8*16`+4]
+	adds	$Tlo,$Tlo,$t2
+	adc	$Thi,$Thi,$t3
+	adds	$Tlo,$Tlo,$t0
+	adc	$Thi,$Thi,$t1
+___
+	&BODY_00_15(0x17);
+$code.=<<___;
+#if __ARM_ARCH__>=7
+	ittt	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
+	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
+	beq	.L16_79
+	bic	$Ktbl,$Ktbl,#1
+
+	ldr	$Tlo,[sp,#$Boff+0]
+	ldr	$Thi,[sp,#$Boff+4]
+	ldr	$t0, [$ctx,#$Aoff+$lo]
+	ldr	$t1, [$ctx,#$Aoff+$hi]
+	ldr	$t2, [$ctx,#$Boff+$lo]
+	ldr	$t3, [$ctx,#$Boff+$hi]
+	adds	$t0,$Alo,$t0
+	str	$t0, [$ctx,#$Aoff+$lo]
+	adc	$t1,$Ahi,$t1
+	str	$t1, [$ctx,#$Aoff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Boff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Boff+$hi]
+
+	ldr	$Alo,[sp,#$Coff+0]
+	ldr	$Ahi,[sp,#$Coff+4]
+	ldr	$Tlo,[sp,#$Doff+0]
+	ldr	$Thi,[sp,#$Doff+4]
+	ldr	$t0, [$ctx,#$Coff+$lo]
+	ldr	$t1, [$ctx,#$Coff+$hi]
+	ldr	$t2, [$ctx,#$Doff+$lo]
+	ldr	$t3, [$ctx,#$Doff+$hi]
+	adds	$t0,$Alo,$t0
+	str	$t0, [$ctx,#$Coff+$lo]
+	adc	$t1,$Ahi,$t1
+	str	$t1, [$ctx,#$Coff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Doff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Doff+$hi]
+
+	ldr	$Tlo,[sp,#$Foff+0]
+	ldr	$Thi,[sp,#$Foff+4]
+	ldr	$t0, [$ctx,#$Eoff+$lo]
+	ldr	$t1, [$ctx,#$Eoff+$hi]
+	ldr	$t2, [$ctx,#$Foff+$lo]
+	ldr	$t3, [$ctx,#$Foff+$hi]
+	adds	$Elo,$Elo,$t0
+	str	$Elo,[$ctx,#$Eoff+$lo]
+	adc	$Ehi,$Ehi,$t1
+	str	$Ehi,[$ctx,#$Eoff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Foff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Foff+$hi]
+
+	ldr	$Alo,[sp,#$Goff+0]
+	ldr	$Ahi,[sp,#$Goff+4]
+	ldr	$Tlo,[sp,#$Hoff+0]
+	ldr	$Thi,[sp,#$Hoff+4]
+	ldr	$t0, [$ctx,#$Goff+$lo]
+	ldr	$t1, [$ctx,#$Goff+$hi]
+	ldr	$t2, [$ctx,#$Hoff+$lo]
+	ldr	$t3, [$ctx,#$Hoff+$hi]
+	adds	$t0,$Alo,$t0
+	str	$t0, [$ctx,#$Goff+$lo]
+	adc	$t1,$Ahi,$t1
+	str	$t1, [$ctx,#$Goff+$hi]
+	adds	$t2,$Tlo,$t2
+	str	$t2, [$ctx,#$Hoff+$lo]
+	adc	$t3,$Thi,$t3
+	str	$t3, [$ctx,#$Hoff+$hi]
+
+	add	sp,sp,#640
+	sub	$Ktbl,$Ktbl,#640
+
+	teq	$inp,$len
+	bne	.Loop
+
+	add	sp,sp,#8*9		@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
+	ldmia	sp!,{r4-r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha512_block_data_order,.-sha512_block_data_order
+___
+
+{
+my @Sigma0=(28,34,39);
+my @Sigma1=(14,18,41);
+my @sigma0=(1, 8, 7);
+my @sigma1=(19,61,6);
+
+my $Ktbl="r3";
+my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch
+
+my @X=map("d$_",(0..15));
+my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
+
+sub NEON_00_15() {
+my $i=shift;
+my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps
+
+$code.=<<___ if ($i<16 || $i&1);
+	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
+#if $i<16
+	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
+#endif
+	vshr.u64	$t1,$e,#@Sigma1[1]
+#if $i>0
+	 vadd.i64	$a,$Maj			@ h+=Maj from the past
+#endif
+	vshr.u64	$t2,$e,#@Sigma1[2]
+___
+$code.=<<___;
+	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
+	vsli.64		$t0,$e,#`64-@Sigma1[0]`
+	vsli.64		$t1,$e,#`64-@Sigma1[1]`
+	vmov		$Ch,$e
+	vsli.64		$t2,$e,#`64-@Sigma1[2]`
+#if $i<16 && defined(__ARMEL__)
+	vrev64.8	@X[$i],@X[$i]
+#endif
+	veor		$t1,$t0
+	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
+	vshr.u64	$t0,$a,#@Sigma0[0]
+	veor		$t2,$t1			@ Sigma1(e)
+	vadd.i64	$T1,$Ch,$h
+	vshr.u64	$t1,$a,#@Sigma0[1]
+	vsli.64		$t0,$a,#`64-@Sigma0[0]`
+	vadd.i64	$T1,$t2
+	vshr.u64	$t2,$a,#@Sigma0[2]
+	vadd.i64	$K,@X[$i%16]
+	vsli.64		$t1,$a,#`64-@Sigma0[1]`
+	veor		$Maj,$a,$b
+	vsli.64		$t2,$a,#`64-@Sigma0[2]`
+	veor		$h,$t0,$t1
+	vadd.i64	$T1,$K
+	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
+	veor		$h,$t2			@ Sigma0(a)
+	vadd.i64	$d,$T1
+	vadd.i64	$Maj,$T1
+	@ vadd.i64	$h,$Maj
+___
+}
+
+sub NEON_16_79() {
+my $i=shift;
+
+if ($i&1)	{ &NEON_00_15($i,@_); return; }
+
+# 2x-vectorized, therefore runs every 2nd round
+my @X=map("q$_",(0..7));			# view @X as 128-bit vector
+my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
+my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
+my $e=@_[4];					# $e from NEON_00_15
+$i /= 2;
+$code.=<<___;
+	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
+	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
+	 vadd.i64	@_[0],d30			@ h+=Maj from the past
+	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
+	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
+	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
+	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
+	veor		$s1,$t0
+	vshr.u64	$t0,$s0,#@sigma0[0]
+	veor		$s1,$t1				@ sigma1(X[i+14])
+	vshr.u64	$t1,$s0,#@sigma0[1]
+	vadd.i64	@X[$i%8],$s1
+	vshr.u64	$s1,$s0,#@sigma0[2]
+	vsli.64		$t0,$s0,#`64-@sigma0[0]`
+	vsli.64		$t1,$s0,#`64-@sigma0[1]`
+	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
+	veor		$s1,$t0
+	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
+	vadd.i64	@X[$i%8],$s0
+	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
+	veor		$s1,$t1				@ sigma0(X[i+1])
+	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
+	vadd.i64	@X[$i%8],$s1
+___
+	&NEON_00_15(2*$i,@_);
+}
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.global	sha512_block_data_order_neon
+.type	sha512_block_data_order_neon,%function
+.align	4
+sha512_block_data_order_neon:
+.LNEON:
+	dmb				@ errata #451034 on early Cortex A8
+	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
+	VFP_ABI_PUSH
+	adrl	$Ktbl,K512
+	vldmia	$ctx,{$A-$H}		@ load context
+.Loop_neon:
+___
+for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	mov		$cnt,#4
+.L16_79_neon:
+	subs		$cnt,#1
+___
+for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	bne		.L16_79_neon
+
+	 vadd.i64	$A,d30		@ h+=Maj from the past
+	vldmia		$ctx,{d24-d31}	@ load context to temp
+	vadd.i64	q8,q12		@ vectorized accumulate
+	vadd.i64	q9,q13
+	vadd.i64	q10,q14
+	vadd.i64	q11,q15
+	vstmia		$ctx,{$A-$H}	@ save context
+	teq		$inp,$len
+	sub		$Ktbl,#640	@ rewind K512
+	bne		.Loop_neon
+
+	VFP_ABI_POP
+	ret				@ bx lr
+.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm	OPENSSL_armcap_P,4,4
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx	lr/gm;
+
+open SELF,$0;
+while(<SELF>) {
+	next if (/^#!/);
+	last if (!s/^#/@/ and !/^$/);
+	print;
+}
+close SELF;
+
+print $code;
+close STDOUT; # enforce flush
diff --git a/arch/arm/crypto/sha512-armv7-neon.S b/arch/arm/crypto/sha512-armv7-neon.S
deleted file mode 100644
index fe99472e507c..000000000000
--- a/arch/arm/crypto/sha512-armv7-neon.S
+++ /dev/null
@@ -1,455 +0,0 @@
-/* sha512-armv7-neon.S  -  ARM/NEON assembly implementation of SHA-512 transform
- *
- * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- */
-
-#include <linux/linkage.h>
-
-
-.syntax unified
-.code   32
-.fpu neon
-
-.text
-
-/* structure of SHA512_CONTEXT */
-#define hd_a 0
-#define hd_b ((hd_a) + 8)
-#define hd_c ((hd_b) + 8)
-#define hd_d ((hd_c) + 8)
-#define hd_e ((hd_d) + 8)
-#define hd_f ((hd_e) + 8)
-#define hd_g ((hd_f) + 8)
-
-/* register macros */
-#define RK %r2
-
-#define RA d0
-#define RB d1
-#define RC d2
-#define RD d3
-#define RE d4
-#define RF d5
-#define RG d6
-#define RH d7
-
-#define RT0 d8
-#define RT1 d9
-#define RT2 d10
-#define RT3 d11
-#define RT4 d12
-#define RT5 d13
-#define RT6 d14
-#define RT7 d15
-
-#define RT01q q4
-#define RT23q q5
-#define RT45q q6
-#define RT67q q7
-
-#define RW0 d16
-#define RW1 d17
-#define RW2 d18
-#define RW3 d19
-#define RW4 d20
-#define RW5 d21
-#define RW6 d22
-#define RW7 d23
-#define RW8 d24
-#define RW9 d25
-#define RW10 d26
-#define RW11 d27
-#define RW12 d28
-#define RW13 d29
-#define RW14 d30
-#define RW15 d31
-
-#define RW01q q8
-#define RW23q q9
-#define RW45q q10
-#define RW67q q11
-#define RW89q q12
-#define RW1011q q13
-#define RW1213q q14
-#define RW1415q q15
-
-/***********************************************************************
- * ARM assembly implementation of sha512 transform
- ***********************************************************************/
-#define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, \
-                     rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \
-	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
-	vshr.u64 RT2, re, #14; \
-	vshl.u64 RT3, re, #64 - 14; \
-	interleave_op(arg1); \
-	vshr.u64 RT4, re, #18; \
-	vshl.u64 RT5, re, #64 - 18; \
-	vld1.64 {RT0}, [RK]!; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, re, #41; \
-	vshl.u64 RT5, re, #64 - 41; \
-	vadd.u64 RT0, RT0, rw0; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vmov.64 RT7, re; \
-	veor.64 RT1, RT2, RT3; \
-	vbsl.64 RT7, rf, rg; \
-	\
-	vadd.u64 RT1, RT1, rh; \
-	vshr.u64 RT2, ra, #28; \
-	vshl.u64 RT3, ra, #64 - 28; \
-	vadd.u64 RT1, RT1, RT0; \
-	vshr.u64 RT4, ra, #34; \
-	vshl.u64 RT5, ra, #64 - 34; \
-	vadd.u64 RT1, RT1, RT7; \
-	\
-	/* h = Sum0 (a) + Maj (a, b, c); */ \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, ra, #39; \
-	vshl.u64 RT5, ra, #64 - 39; \
-	veor.64 RT0, ra, rb; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vbsl.64 RT0, rc, rb; \
-	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
-	veor.64 rh, RT2, RT3; \
-	\
-	/* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
-	vshr.u64 RT2, rd, #14; \
-	vshl.u64 RT3, rd, #64 - 14; \
-	vadd.u64 rh, rh, RT0; \
-	vshr.u64 RT4, rd, #18; \
-	vshl.u64 RT5, rd, #64 - 18; \
-	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
-	vld1.64 {RT0}, [RK]!; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, rd, #41; \
-	vshl.u64 RT5, rd, #64 - 41; \
-	vadd.u64 RT0, RT0, rw1; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vmov.64 RT7, rd; \
-	veor.64 RT1, RT2, RT3; \
-	vbsl.64 RT7, re, rf; \
-	\
-	vadd.u64 RT1, RT1, rg; \
-	vshr.u64 RT2, rh, #28; \
-	vshl.u64 RT3, rh, #64 - 28; \
-	vadd.u64 RT1, RT1, RT0; \
-	vshr.u64 RT4, rh, #34; \
-	vshl.u64 RT5, rh, #64 - 34; \
-	vadd.u64 RT1, RT1, RT7; \
-	\
-	/* g = Sum0 (h) + Maj (h, a, b); */ \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, rh, #39; \
-	vshl.u64 RT5, rh, #64 - 39; \
-	veor.64 RT0, rh, ra; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vbsl.64 RT0, rb, ra; \
-	vadd.u64 rc, rc, RT1; /* c+=t1; */ \
-	veor.64 rg, RT2, RT3; \
-	\
-	/* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
-	/* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \
-	\
-	/**** S0(w[1:2]) */ \
-	\
-	/* w[0:1] += w[9:10] */ \
-	/* RT23q = rw1:rw2 */ \
-	vext.u64 RT23q, rw01q, rw23q, #1; \
-	vadd.u64 rw0, rw9; \
-	vadd.u64 rg, rg, RT0; \
-	vadd.u64 rw1, rw10;\
-	vadd.u64 rg, rg, RT1; /* g+=t1; */ \
-	\
-	vshr.u64 RT45q, RT23q, #1; \
-	vshl.u64 RT67q, RT23q, #64 - 1; \
-	vshr.u64 RT01q, RT23q, #8; \
-	veor.u64 RT45q, RT45q, RT67q; \
-	vshl.u64 RT67q, RT23q, #64 - 8; \
-	veor.u64 RT45q, RT45q, RT01q; \
-	vshr.u64 RT01q, RT23q, #7; \
-	veor.u64 RT45q, RT45q, RT67q; \
-	\
-	/**** S1(w[14:15]) */ \
-	vshr.u64 RT23q, rw1415q, #6; \
-	veor.u64 RT01q, RT01q, RT45q; \
-	vshr.u64 RT45q, rw1415q, #19; \
-	vshl.u64 RT67q, rw1415q, #64 - 19; \
-	veor.u64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT45q, rw1415q, #61; \
-	veor.u64 RT23q, RT23q, RT67q; \
-	vshl.u64 RT67q, rw1415q, #64 - 61; \
-	veor.u64 RT23q, RT23q, RT45q; \
-	vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \
-	veor.u64 RT01q, RT23q, RT67q;
-#define vadd_RT01q(rw01q) \
-	/* w[0:1] += S(w[14:15]) */ \
-	vadd.u64 rw01q, RT01q;
-
-#define dummy(_) /*_*/
-
-#define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, \
-	              interleave_op1, arg1, interleave_op2, arg2) \
-	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
-	vshr.u64 RT2, re, #14; \
-	vshl.u64 RT3, re, #64 - 14; \
-	interleave_op1(arg1); \
-	vshr.u64 RT4, re, #18; \
-	vshl.u64 RT5, re, #64 - 18; \
-	interleave_op2(arg2); \
-	vld1.64 {RT0}, [RK]!; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, re, #41; \
-	vshl.u64 RT5, re, #64 - 41; \
-	vadd.u64 RT0, RT0, rw0; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vmov.64 RT7, re; \
-	veor.64 RT1, RT2, RT3; \
-	vbsl.64 RT7, rf, rg; \
-	\
-	vadd.u64 RT1, RT1, rh; \
-	vshr.u64 RT2, ra, #28; \
-	vshl.u64 RT3, ra, #64 - 28; \
-	vadd.u64 RT1, RT1, RT0; \
-	vshr.u64 RT4, ra, #34; \
-	vshl.u64 RT5, ra, #64 - 34; \
-	vadd.u64 RT1, RT1, RT7; \
-	\
-	/* h = Sum0 (a) + Maj (a, b, c); */ \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, ra, #39; \
-	vshl.u64 RT5, ra, #64 - 39; \
-	veor.64 RT0, ra, rb; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vbsl.64 RT0, rc, rb; \
-	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
-	veor.64 rh, RT2, RT3; \
-	\
-	/* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
-	vshr.u64 RT2, rd, #14; \
-	vshl.u64 RT3, rd, #64 - 14; \
-	vadd.u64 rh, rh, RT0; \
-	vshr.u64 RT4, rd, #18; \
-	vshl.u64 RT5, rd, #64 - 18; \
-	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
-	vld1.64 {RT0}, [RK]!; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, rd, #41; \
-	vshl.u64 RT5, rd, #64 - 41; \
-	vadd.u64 RT0, RT0, rw1; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vmov.64 RT7, rd; \
-	veor.64 RT1, RT2, RT3; \
-	vbsl.64 RT7, re, rf; \
-	\
-	vadd.u64 RT1, RT1, rg; \
-	vshr.u64 RT2, rh, #28; \
-	vshl.u64 RT3, rh, #64 - 28; \
-	vadd.u64 RT1, RT1, RT0; \
-	vshr.u64 RT4, rh, #34; \
-	vshl.u64 RT5, rh, #64 - 34; \
-	vadd.u64 RT1, RT1, RT7; \
-	\
-	/* g = Sum0 (h) + Maj (h, a, b); */ \
-	veor.64 RT23q, RT23q, RT45q; \
-	vshr.u64 RT4, rh, #39; \
-	vshl.u64 RT5, rh, #64 - 39; \
-	veor.64 RT0, rh, ra; \
-	veor.64 RT23q, RT23q, RT45q; \
-	vbsl.64 RT0, rb, ra; \
-	vadd.u64 rc, rc, RT1; /* c+=t1; */ \
-	veor.64 rg, RT2, RT3;
-#define vadd_rg_RT0(rg) \
-	vadd.u64 rg, rg, RT0;
-#define vadd_rg_RT1(rg) \
-	vadd.u64 rg, rg, RT1; /* g+=t1; */
-
-.align 3
-ENTRY(sha512_transform_neon)
-	/* Input:
-	 *	%r0: SHA512_CONTEXT
-	 *	%r1: data
-	 *	%r2: u64 k[] constants
-	 *	%r3: nblks
-	 */
-	push {%lr};
-
-	mov %lr, #0;
-
-	/* Load context to d0-d7 */
-	vld1.64 {RA-RD}, [%r0]!;
-	vld1.64 {RE-RH}, [%r0];
-	sub %r0, #(4*8);
-
-	/* Load input to w[16], d16-d31 */
-	/* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
-	vld1.64 {RW0-RW3}, [%r1]!;
-	vld1.64 {RW4-RW7}, [%r1]!;
-	vld1.64 {RW8-RW11}, [%r1]!;
-	vld1.64 {RW12-RW15}, [%r1]!;
-#ifdef __ARMEL__
-	/* byteswap */
-	vrev64.8 RW01q, RW01q;
-	vrev64.8 RW23q, RW23q;
-	vrev64.8 RW45q, RW45q;
-	vrev64.8 RW67q, RW67q;
-	vrev64.8 RW89q, RW89q;
-	vrev64.8 RW1011q, RW1011q;
-	vrev64.8 RW1213q, RW1213q;
-	vrev64.8 RW1415q, RW1415q;
-#endif
-
-	/* EABI says that d8-d15 must be preserved by callee. */
-	/*vpush {RT0-RT7};*/
-
-.Loop:
-	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
-		     RW23q, RW1415q, RW9, RW10, dummy, _);
-	b .Lenter_rounds;
-
-.Loop_rounds:
-	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
-		     RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q);
-.Lenter_rounds:
-	rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4,
-		     RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q);
-	rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6,
-		     RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q);
-	rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8,
-		     RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q);
-	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10,
-		     RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q);
-	rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12,
-		     RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q);
-	add %lr, #16;
-	rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14,
-		     RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q);
-	cmp %lr, #64;
-	rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0,
-		     RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q);
-	bne .Loop_rounds;
-
-	subs %r3, #1;
-
-	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1,
-		      vadd_RT01q, RW1415q, dummy, _);
-	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3,
-		      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
-	beq .Lhandle_tail;
-	vld1.64 {RW0-RW3}, [%r1]!;
-	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
-		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
-	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
-		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
-#ifdef __ARMEL__
-	vrev64.8 RW01q, RW01q;
-	vrev64.8 RW23q, RW23q;
-#endif
-	vld1.64 {RW4-RW7}, [%r1]!;
-	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
-		      vadd_rg_RT0, RA, vadd_rg_RT1, RA);
-	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
-		      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
-#ifdef __ARMEL__
-	vrev64.8 RW45q, RW45q;
-	vrev64.8 RW67q, RW67q;
-#endif
-	vld1.64 {RW8-RW11}, [%r1]!;
-	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
-		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
-	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
-		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
-#ifdef __ARMEL__
-	vrev64.8 RW89q, RW89q;
-	vrev64.8 RW1011q, RW1011q;
-#endif
-	vld1.64 {RW12-RW15}, [%r1]!;
-	vadd_rg_RT0(RA);
-	vadd_rg_RT1(RA);
-
-	/* Load context */
-	vld1.64 {RT0-RT3}, [%r0]!;
-	vld1.64 {RT4-RT7}, [%r0];
-	sub %r0, #(4*8);
-
-#ifdef __ARMEL__
-	vrev64.8 RW1213q, RW1213q;
-	vrev64.8 RW1415q, RW1415q;
-#endif
-
-	vadd.u64 RA, RT0;
-	vadd.u64 RB, RT1;
-	vadd.u64 RC, RT2;
-	vadd.u64 RD, RT3;
-	vadd.u64 RE, RT4;
-	vadd.u64 RF, RT5;
-	vadd.u64 RG, RT6;
-	vadd.u64 RH, RT7;
-
-	/* Store the first half of context */
-	vst1.64 {RA-RD}, [%r0]!;
-	sub RK, $(8*80);
-	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
-	mov %lr, #0;
-	sub %r0, #(4*8);
-
-	b .Loop;
-
-.Lhandle_tail:
-	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
-		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
-	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
-		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
-	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
-		      vadd_rg_RT0, RA, vadd_rg_RT1, RA);
-	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
-		      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
-	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
-		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
-	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
-		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
-
-	/* Load context to d16-d23 */
-	vld1.64 {RW0-RW3}, [%r0]!;
-	vadd_rg_RT0(RA);
-	vld1.64 {RW4-RW7}, [%r0];
-	vadd_rg_RT1(RA);
-	sub %r0, #(4*8);
-
-	vadd.u64 RA, RW0;
-	vadd.u64 RB, RW1;
-	vadd.u64 RC, RW2;
-	vadd.u64 RD, RW3;
-	vadd.u64 RE, RW4;
-	vadd.u64 RF, RW5;
-	vadd.u64 RG, RW6;
-	vadd.u64 RH, RW7;
-
-	/* Store the first half of context */
-	vst1.64 {RA-RD}, [%r0]!;
-
-	/* Clear used registers */
-	/* d16-d31 */
-	veor.u64 RW01q, RW01q;
-	veor.u64 RW23q, RW23q;
-	veor.u64 RW45q, RW45q;
-	veor.u64 RW67q, RW67q;
-	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
-	veor.u64 RW89q, RW89q;
-	veor.u64 RW1011q, RW1011q;
-	veor.u64 RW1213q, RW1213q;
-	veor.u64 RW1415q, RW1415q;
-	/* d8-d15 */
-	/*vpop {RT0-RT7};*/
-	/* d0-d7 (q0-q3) */
-	veor.u64 %q0, %q0;
-	veor.u64 %q1, %q1;
-	veor.u64 %q2, %q2;
-	veor.u64 %q3, %q3;
-
-	pop {%pc};
-ENDPROC(sha512_transform_neon)
diff --git a/arch/arm/crypto/sha512-core.S_shipped b/arch/arm/crypto/sha512-core.S_shipped
new file mode 100644
index 000000000000..3694c4d4ca2b
--- /dev/null
+++ b/arch/arm/crypto/sha512-core.S_shipped
@@ -0,0 +1,1861 @@
+
+@ ====================================================================
+@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+@ project. The module is, however, dual licensed under OpenSSL and
+@ CRYPTOGAMS licenses depending on where you obtain it. For further
+@ details see http://www.openssl.org/~appro/cryptogams/.
+@
+@ Permission to use under GPL terms is granted.
+@ ====================================================================
+
+@ SHA512 block procedure for ARMv4. September 2007.
+
+@ This code is ~4.5 (four and a half) times faster than code generated
+@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
+@ Xscale PXA250 core].
+@
+@ July 2010.
+@
+@ Rescheduling for dual-issue pipeline resulted in 6% improvement on
+@ Cortex A8 core and ~40 cycles per processed byte.
+
+@ February 2011.
+@
+@ Profiler-assisted and platform-specific optimization resulted in 7%
+@ improvement on Coxtex A8 core and ~38 cycles per byte.
+
+@ March 2011.
+@
+@ Add NEON implementation. On Cortex A8 it was measured to process
+@ one byte in 23.3 cycles or ~60% faster than integer-only code.
+
+@ August 2012.
+@
+@ Improve NEON performance by 12% on Snapdragon S4. In absolute
+@ terms it's 22.6 cycles per byte, which is disappointing result.
+@ Technical writers asserted that 3-way S4 pipeline can sustain
+@ multiple NEON instructions per cycle, but dual NEON issue could
+@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
+@ for further details. On side note Cortex-A15 processes one byte in
+@ 16 cycles.
+
+@ Byte order [in]dependence. =========================================
+@
+@ Originally caller was expected to maintain specific *dword* order in
+@ h[0-7], namely with most significant dword at *lower* address, which
+@ was reflected in below two parameters as 0 and 4. Now caller is
+@ expected to maintain native byte order for whole 64-bit values.
+#ifndef __KERNEL__
+# include "arm_arch.h"
+# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
+# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
+#else
+# define __ARM_ARCH__ __LINUX_ARM_ARCH__
+# define __ARM_MAX_ARCH__ 7
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
+#endif
+
+.text
+#if __ARM_ARCH__<7
+.code	32
+#else
+.syntax unified
+# ifdef __thumb2__
+#  define adrl adr
+.thumb
+# else
+.code   32
+# endif
+#endif
+
+.type	K512,%object
+.align	5
+K512:
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
+.size	K512,.-K512
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha512_block_data_order
+.skip	32-4
+#else
+.skip	32
+#endif
+
+.global	sha512_block_data_order
+.type	sha512_block_data_order,%function
+sha512_block_data_order:
+#if __ARM_ARCH__<7
+	sub	r3,pc,#8		@ sha512_block_data_order
+#else
+	adr	r3,sha512_block_data_order
+#endif
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#1
+	bne	.LNEON
+#endif
+	add	r2,r1,r2,lsl#7	@ len to point at the end of inp
+	stmdb	sp!,{r4-r12,lr}
+	sub	r14,r3,#672		@ K512
+	sub	sp,sp,#9*8
+
+	ldr	r7,[r0,#32+LO]
+	ldr	r8,[r0,#32+HI]
+	ldr	r9, [r0,#48+LO]
+	ldr	r10, [r0,#48+HI]
+	ldr	r11, [r0,#56+LO]
+	ldr	r12, [r0,#56+HI]
+.Loop:
+	str	r9, [sp,#48+0]
+	str	r10, [sp,#48+4]
+	str	r11, [sp,#56+0]
+	str	r12, [sp,#56+4]
+	ldr	r5,[r0,#0+LO]
+	ldr	r6,[r0,#0+HI]
+	ldr	r3,[r0,#8+LO]
+	ldr	r4,[r0,#8+HI]
+	ldr	r9, [r0,#16+LO]
+	ldr	r10, [r0,#16+HI]
+	ldr	r11, [r0,#24+LO]
+	ldr	r12, [r0,#24+HI]
+	str	r3,[sp,#8+0]
+	str	r4,[sp,#8+4]
+	str	r9, [sp,#16+0]
+	str	r10, [sp,#16+4]
+	str	r11, [sp,#24+0]
+	str	r12, [sp,#24+4]
+	ldr	r3,[r0,#40+LO]
+	ldr	r4,[r0,#40+HI]
+	str	r3,[sp,#40+0]
+	str	r4,[sp,#40+4]
+
+.L00_15:
+#if __ARM_ARCH__<7
+	ldrb	r3,[r1,#7]
+	ldrb	r9, [r1,#6]
+	ldrb	r10, [r1,#5]
+	ldrb	r11, [r1,#4]
+	ldrb	r4,[r1,#3]
+	ldrb	r12, [r1,#2]
+	orr	r3,r3,r9,lsl#8
+	ldrb	r9, [r1,#1]
+	orr	r3,r3,r10,lsl#16
+	ldrb	r10, [r1],#8
+	orr	r3,r3,r11,lsl#24
+	orr	r4,r4,r12,lsl#8
+	orr	r4,r4,r9,lsl#16
+	orr	r4,r4,r10,lsl#24
+#else
+	ldr	r3,[r1,#4]
+	ldr	r4,[r1],#8
+#ifdef __ARMEL__
+	rev	r3,r3
+	rev	r4,r4
+#endif
+#endif
+	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+	mov	r9,r7,lsr#14
+	str	r3,[sp,#64+0]
+	mov	r10,r8,lsr#14
+	str	r4,[sp,#64+4]
+	eor	r9,r9,r8,lsl#18
+	ldr	r11,[sp,#56+0]	@ h.lo
+	eor	r10,r10,r7,lsl#18
+	ldr	r12,[sp,#56+4]	@ h.hi
+	eor	r9,r9,r7,lsr#18
+	eor	r10,r10,r8,lsr#18
+	eor	r9,r9,r8,lsl#14
+	eor	r10,r10,r7,lsl#14
+	eor	r9,r9,r8,lsr#9
+	eor	r10,r10,r7,lsr#9
+	eor	r9,r9,r7,lsl#23
+	eor	r10,r10,r8,lsl#23	@ Sigma1(e)
+	adds	r3,r3,r9
+	ldr	r9,[sp,#40+0]	@ f.lo
+	adc	r4,r4,r10		@ T += Sigma1(e)
+	ldr	r10,[sp,#40+4]	@ f.hi
+	adds	r3,r3,r11
+	ldr	r11,[sp,#48+0]	@ g.lo
+	adc	r4,r4,r12		@ T += h
+	ldr	r12,[sp,#48+4]	@ g.hi
+
+	eor	r9,r9,r11
+	str	r7,[sp,#32+0]
+	eor	r10,r10,r12
+	str	r8,[sp,#32+4]
+	and	r9,r9,r7
+	str	r5,[sp,#0+0]
+	and	r10,r10,r8
+	str	r6,[sp,#0+4]
+	eor	r9,r9,r11
+	ldr	r11,[r14,#LO]	@ K[i].lo
+	eor	r10,r10,r12		@ Ch(e,f,g)
+	ldr	r12,[r14,#HI]	@ K[i].hi
+
+	adds	r3,r3,r9
+	ldr	r7,[sp,#24+0]	@ d.lo
+	adc	r4,r4,r10		@ T += Ch(e,f,g)
+	ldr	r8,[sp,#24+4]	@ d.hi
+	adds	r3,r3,r11
+	and	r9,r11,#0xff
+	adc	r4,r4,r12		@ T += K[i]
+	adds	r7,r7,r3
+	ldr	r11,[sp,#8+0]	@ b.lo
+	adc	r8,r8,r4		@ d += T
+	teq	r9,#148
+
+	ldr	r12,[sp,#16+0]	@ c.lo
+#if __ARM_ARCH__>=7
+	it	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	orreq	r14,r14,#1
+	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+	mov	r9,r5,lsr#28
+	mov	r10,r6,lsr#28
+	eor	r9,r9,r6,lsl#4
+	eor	r10,r10,r5,lsl#4
+	eor	r9,r9,r6,lsr#2
+	eor	r10,r10,r5,lsr#2
+	eor	r9,r9,r5,lsl#30
+	eor	r10,r10,r6,lsl#30
+	eor	r9,r9,r6,lsr#7
+	eor	r10,r10,r5,lsr#7
+	eor	r9,r9,r5,lsl#25
+	eor	r10,r10,r6,lsl#25	@ Sigma0(a)
+	adds	r3,r3,r9
+	and	r9,r5,r11
+	adc	r4,r4,r10		@ T += Sigma0(a)
+
+	ldr	r10,[sp,#8+4]	@ b.hi
+	orr	r5,r5,r11
+	ldr	r11,[sp,#16+4]	@ c.hi
+	and	r5,r5,r12
+	and	r12,r6,r10
+	orr	r6,r6,r10
+	orr	r5,r5,r9		@ Maj(a,b,c).lo
+	and	r6,r6,r11
+	adds	r5,r5,r3
+	orr	r6,r6,r12		@ Maj(a,b,c).hi
+	sub	sp,sp,#8
+	adc	r6,r6,r4		@ h += T
+	tst	r14,#1
+	add	r14,r14,#8
+	tst	r14,#1
+	beq	.L00_15
+	ldr	r9,[sp,#184+0]
+	ldr	r10,[sp,#184+4]
+	bic	r14,r14,#1
+.L16_79:
+	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
+	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
+	mov	r3,r9,lsr#1
+	ldr	r11,[sp,#80+0]
+	mov	r4,r10,lsr#1
+	ldr	r12,[sp,#80+4]
+	eor	r3,r3,r10,lsl#31
+	eor	r4,r4,r9,lsl#31
+	eor	r3,r3,r9,lsr#8
+	eor	r4,r4,r10,lsr#8
+	eor	r3,r3,r10,lsl#24
+	eor	r4,r4,r9,lsl#24
+	eor	r3,r3,r9,lsr#7
+	eor	r4,r4,r10,lsr#7
+	eor	r3,r3,r10,lsl#25
+
+	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+	mov	r9,r11,lsr#19
+	mov	r10,r12,lsr#19
+	eor	r9,r9,r12,lsl#13
+	eor	r10,r10,r11,lsl#13
+	eor	r9,r9,r12,lsr#29
+	eor	r10,r10,r11,lsr#29
+	eor	r9,r9,r11,lsl#3
+	eor	r10,r10,r12,lsl#3
+	eor	r9,r9,r11,lsr#6
+	eor	r10,r10,r12,lsr#6
+	ldr	r11,[sp,#120+0]
+	eor	r9,r9,r12,lsl#26
+
+	ldr	r12,[sp,#120+4]
+	adds	r3,r3,r9
+	ldr	r9,[sp,#192+0]
+	adc	r4,r4,r10
+
+	ldr	r10,[sp,#192+4]
+	adds	r3,r3,r11
+	adc	r4,r4,r12
+	adds	r3,r3,r9
+	adc	r4,r4,r10
+	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+	mov	r9,r7,lsr#14
+	str	r3,[sp,#64+0]
+	mov	r10,r8,lsr#14
+	str	r4,[sp,#64+4]
+	eor	r9,r9,r8,lsl#18
+	ldr	r11,[sp,#56+0]	@ h.lo
+	eor	r10,r10,r7,lsl#18
+	ldr	r12,[sp,#56+4]	@ h.hi
+	eor	r9,r9,r7,lsr#18
+	eor	r10,r10,r8,lsr#18
+	eor	r9,r9,r8,lsl#14
+	eor	r10,r10,r7,lsl#14
+	eor	r9,r9,r8,lsr#9
+	eor	r10,r10,r7,lsr#9
+	eor	r9,r9,r7,lsl#23
+	eor	r10,r10,r8,lsl#23	@ Sigma1(e)
+	adds	r3,r3,r9
+	ldr	r9,[sp,#40+0]	@ f.lo
+	adc	r4,r4,r10		@ T += Sigma1(e)
+	ldr	r10,[sp,#40+4]	@ f.hi
+	adds	r3,r3,r11
+	ldr	r11,[sp,#48+0]	@ g.lo
+	adc	r4,r4,r12		@ T += h
+	ldr	r12,[sp,#48+4]	@ g.hi
+
+	eor	r9,r9,r11
+	str	r7,[sp,#32+0]
+	eor	r10,r10,r12
+	str	r8,[sp,#32+4]
+	and	r9,r9,r7
+	str	r5,[sp,#0+0]
+	and	r10,r10,r8
+	str	r6,[sp,#0+4]
+	eor	r9,r9,r11
+	ldr	r11,[r14,#LO]	@ K[i].lo
+	eor	r10,r10,r12		@ Ch(e,f,g)
+	ldr	r12,[r14,#HI]	@ K[i].hi
+
+	adds	r3,r3,r9
+	ldr	r7,[sp,#24+0]	@ d.lo
+	adc	r4,r4,r10		@ T += Ch(e,f,g)
+	ldr	r8,[sp,#24+4]	@ d.hi
+	adds	r3,r3,r11
+	and	r9,r11,#0xff
+	adc	r4,r4,r12		@ T += K[i]
+	adds	r7,r7,r3
+	ldr	r11,[sp,#8+0]	@ b.lo
+	adc	r8,r8,r4		@ d += T
+	teq	r9,#23
+
+	ldr	r12,[sp,#16+0]	@ c.lo
+#if __ARM_ARCH__>=7
+	it	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	orreq	r14,r14,#1
+	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+	mov	r9,r5,lsr#28
+	mov	r10,r6,lsr#28
+	eor	r9,r9,r6,lsl#4
+	eor	r10,r10,r5,lsl#4
+	eor	r9,r9,r6,lsr#2
+	eor	r10,r10,r5,lsr#2
+	eor	r9,r9,r5,lsl#30
+	eor	r10,r10,r6,lsl#30
+	eor	r9,r9,r6,lsr#7
+	eor	r10,r10,r5,lsr#7
+	eor	r9,r9,r5,lsl#25
+	eor	r10,r10,r6,lsl#25	@ Sigma0(a)
+	adds	r3,r3,r9
+	and	r9,r5,r11
+	adc	r4,r4,r10		@ T += Sigma0(a)
+
+	ldr	r10,[sp,#8+4]	@ b.hi
+	orr	r5,r5,r11
+	ldr	r11,[sp,#16+4]	@ c.hi
+	and	r5,r5,r12
+	and	r12,r6,r10
+	orr	r6,r6,r10
+	orr	r5,r5,r9		@ Maj(a,b,c).lo
+	and	r6,r6,r11
+	adds	r5,r5,r3
+	orr	r6,r6,r12		@ Maj(a,b,c).hi
+	sub	sp,sp,#8
+	adc	r6,r6,r4		@ h += T
+	tst	r14,#1
+	add	r14,r14,#8
+#if __ARM_ARCH__>=7
+	ittt	eq			@ Thumb2 thing, sanity check in ARM
+#endif
+	ldreq	r9,[sp,#184+0]
+	ldreq	r10,[sp,#184+4]
+	beq	.L16_79
+	bic	r14,r14,#1
+
+	ldr	r3,[sp,#8+0]
+	ldr	r4,[sp,#8+4]
+	ldr	r9, [r0,#0+LO]
+	ldr	r10, [r0,#0+HI]
+	ldr	r11, [r0,#8+LO]
+	ldr	r12, [r0,#8+HI]
+	adds	r9,r5,r9
+	str	r9, [r0,#0+LO]
+	adc	r10,r6,r10
+	str	r10, [r0,#0+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#8+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#8+HI]
+
+	ldr	r5,[sp,#16+0]
+	ldr	r6,[sp,#16+4]
+	ldr	r3,[sp,#24+0]
+	ldr	r4,[sp,#24+4]
+	ldr	r9, [r0,#16+LO]
+	ldr	r10, [r0,#16+HI]
+	ldr	r11, [r0,#24+LO]
+	ldr	r12, [r0,#24+HI]
+	adds	r9,r5,r9
+	str	r9, [r0,#16+LO]
+	adc	r10,r6,r10
+	str	r10, [r0,#16+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#24+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#24+HI]
+
+	ldr	r3,[sp,#40+0]
+	ldr	r4,[sp,#40+4]
+	ldr	r9, [r0,#32+LO]
+	ldr	r10, [r0,#32+HI]
+	ldr	r11, [r0,#40+LO]
+	ldr	r12, [r0,#40+HI]
+	adds	r7,r7,r9
+	str	r7,[r0,#32+LO]
+	adc	r8,r8,r10
+	str	r8,[r0,#32+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#40+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#40+HI]
+
+	ldr	r5,[sp,#48+0]
+	ldr	r6,[sp,#48+4]
+	ldr	r3,[sp,#56+0]
+	ldr	r4,[sp,#56+4]
+	ldr	r9, [r0,#48+LO]
+	ldr	r10, [r0,#48+HI]
+	ldr	r11, [r0,#56+LO]
+	ldr	r12, [r0,#56+HI]
+	adds	r9,r5,r9
+	str	r9, [r0,#48+LO]
+	adc	r10,r6,r10
+	str	r10, [r0,#48+HI]
+	adds	r11,r3,r11
+	str	r11, [r0,#56+LO]
+	adc	r12,r4,r12
+	str	r12, [r0,#56+HI]
+
+	add	sp,sp,#640
+	sub	r14,r14,#640
+
+	teq	r1,r2
+	bne	.Loop
+
+	add	sp,sp,#8*9		@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
+	ldmia	sp!,{r4-r12,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
+#endif
+.size	sha512_block_data_order,.-sha512_block_data_order
+#if __ARM_MAX_ARCH__>=7
+.arch	armv7-a
+.fpu	neon
+
+.global	sha512_block_data_order_neon
+.type	sha512_block_data_order_neon,%function
+.align	4
+sha512_block_data_order_neon:
+.LNEON:
+	dmb				@ errata #451034 on early Cortex A8
+	add	r2,r1,r2,lsl#7	@ len to point at the end of inp
+	VFP_ABI_PUSH
+	adrl	r3,K512
+	vldmia	r0,{d16-d23}		@ load context
+.Loop_neon:
+	vshr.u64	d24,d20,#14	@ 0
+#if 0<16
+	vld1.64		{d0},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d20,#18
+#if 0>0
+	 vadd.i64	d16,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d20,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d20,#50
+	vsli.64		d25,d20,#46
+	vmov		d29,d20
+	vsli.64		d26,d20,#23
+#if 0<16 && defined(__ARMEL__)
+	vrev64.8	d0,d0
+#endif
+	veor		d25,d24
+	vbsl		d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64		d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d0
+	vsli.64		d25,d16,#30
+	veor		d30,d16,d17
+	vsli.64		d26,d16,#25
+	veor		d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d18,d17		@ Maj(a,b,c)
+	veor		d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 1
+#if 1<16
+	vld1.64		{d1},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 1>0
+	 vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d19,#50
+	vsli.64		d25,d19,#46
+	vmov		d29,d19
+	vsli.64		d26,d19,#23
+#if 1<16 && defined(__ARMEL__)
+	vrev64.8	d1,d1
+#endif
+	veor		d25,d24
+	vbsl		d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64		d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d1
+	vsli.64		d25,d23,#30
+	veor		d30,d23,d16
+	vsli.64		d26,d23,#25
+	veor		d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d17,d16		@ Maj(a,b,c)
+	veor		d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	d24,d18,#14	@ 2
+#if 2<16
+	vld1.64		{d2},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d18,#18
+#if 2>0
+	 vadd.i64	d22,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d18,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d18,#50
+	vsli.64		d25,d18,#46
+	vmov		d29,d18
+	vsli.64		d26,d18,#23
+#if 2<16 && defined(__ARMEL__)
+	vrev64.8	d2,d2
+#endif
+	veor		d25,d24
+	vbsl		d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64		d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d2
+	vsli.64		d25,d22,#30
+	veor		d30,d22,d23
+	vsli.64		d26,d22,#25
+	veor		d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d16,d23		@ Maj(a,b,c)
+	veor		d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 3
+#if 3<16
+	vld1.64		{d3},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 3>0
+	 vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d17,#50
+	vsli.64		d25,d17,#46
+	vmov		d29,d17
+	vsli.64		d26,d17,#23
+#if 3<16 && defined(__ARMEL__)
+	vrev64.8	d3,d3
+#endif
+	veor		d25,d24
+	vbsl		d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64		d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d3
+	vsli.64		d25,d21,#30
+	veor		d30,d21,d22
+	vsli.64		d26,d21,#25
+	veor		d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d23,d22		@ Maj(a,b,c)
+	veor		d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	d24,d16,#14	@ 4
+#if 4<16
+	vld1.64		{d4},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d16,#18
+#if 4>0
+	 vadd.i64	d20,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d16,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d16,#50
+	vsli.64		d25,d16,#46
+	vmov		d29,d16
+	vsli.64		d26,d16,#23
+#if 4<16 && defined(__ARMEL__)
+	vrev64.8	d4,d4
+#endif
+	veor		d25,d24
+	vbsl		d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64		d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d4
+	vsli.64		d25,d20,#30
+	veor		d30,d20,d21
+	vsli.64		d26,d20,#25
+	veor		d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d22,d21		@ Maj(a,b,c)
+	veor		d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 5
+#if 5<16
+	vld1.64		{d5},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 5>0
+	 vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d23,#50
+	vsli.64		d25,d23,#46
+	vmov		d29,d23
+	vsli.64		d26,d23,#23
+#if 5<16 && defined(__ARMEL__)
+	vrev64.8	d5,d5
+#endif
+	veor		d25,d24
+	vbsl		d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64		d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d5
+	vsli.64		d25,d19,#30
+	veor		d30,d19,d20
+	vsli.64		d26,d19,#25
+	veor		d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d21,d20		@ Maj(a,b,c)
+	veor		d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	d24,d22,#14	@ 6
+#if 6<16
+	vld1.64		{d6},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d22,#18
+#if 6>0
+	 vadd.i64	d18,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d22,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d22,#50
+	vsli.64		d25,d22,#46
+	vmov		d29,d22
+	vsli.64		d26,d22,#23
+#if 6<16 && defined(__ARMEL__)
+	vrev64.8	d6,d6
+#endif
+	veor		d25,d24
+	vbsl		d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64		d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d6
+	vsli.64		d25,d18,#30
+	veor		d30,d18,d19
+	vsli.64		d26,d18,#25
+	veor		d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d20,d19		@ Maj(a,b,c)
+	veor		d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 7
+#if 7<16
+	vld1.64		{d7},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 7>0
+	 vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d21,#50
+	vsli.64		d25,d21,#46
+	vmov		d29,d21
+	vsli.64		d26,d21,#23
+#if 7<16 && defined(__ARMEL__)
+	vrev64.8	d7,d7
+#endif
+	veor		d25,d24
+	vbsl		d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64		d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d7
+	vsli.64		d25,d17,#30
+	veor		d30,d17,d18
+	vsli.64		d26,d17,#25
+	veor		d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d19,d18		@ Maj(a,b,c)
+	veor		d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	vshr.u64	d24,d20,#14	@ 8
+#if 8<16
+	vld1.64		{d8},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d20,#18
+#if 8>0
+	 vadd.i64	d16,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d20,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d20,#50
+	vsli.64		d25,d20,#46
+	vmov		d29,d20
+	vsli.64		d26,d20,#23
+#if 8<16 && defined(__ARMEL__)
+	vrev64.8	d8,d8
+#endif
+	veor		d25,d24
+	vbsl		d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64		d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d8
+	vsli.64		d25,d16,#30
+	veor		d30,d16,d17
+	vsli.64		d26,d16,#25
+	veor		d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d18,d17		@ Maj(a,b,c)
+	veor		d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 9
+#if 9<16
+	vld1.64		{d9},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 9>0
+	 vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d19,#50
+	vsli.64		d25,d19,#46
+	vmov		d29,d19
+	vsli.64		d26,d19,#23
+#if 9<16 && defined(__ARMEL__)
+	vrev64.8	d9,d9
+#endif
+	veor		d25,d24
+	vbsl		d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64		d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d9
+	vsli.64		d25,d23,#30
+	veor		d30,d23,d16
+	vsli.64		d26,d23,#25
+	veor		d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d17,d16		@ Maj(a,b,c)
+	veor		d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	d24,d18,#14	@ 10
+#if 10<16
+	vld1.64		{d10},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d18,#18
+#if 10>0
+	 vadd.i64	d22,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d18,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d18,#50
+	vsli.64		d25,d18,#46
+	vmov		d29,d18
+	vsli.64		d26,d18,#23
+#if 10<16 && defined(__ARMEL__)
+	vrev64.8	d10,d10
+#endif
+	veor		d25,d24
+	vbsl		d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64		d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d10
+	vsli.64		d25,d22,#30
+	veor		d30,d22,d23
+	vsli.64		d26,d22,#25
+	veor		d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d16,d23		@ Maj(a,b,c)
+	veor		d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 11
+#if 11<16
+	vld1.64		{d11},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 11>0
+	 vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d17,#50
+	vsli.64		d25,d17,#46
+	vmov		d29,d17
+	vsli.64		d26,d17,#23
+#if 11<16 && defined(__ARMEL__)
+	vrev64.8	d11,d11
+#endif
+	veor		d25,d24
+	vbsl		d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64		d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d11
+	vsli.64		d25,d21,#30
+	veor		d30,d21,d22
+	vsli.64		d26,d21,#25
+	veor		d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d23,d22		@ Maj(a,b,c)
+	veor		d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	d24,d16,#14	@ 12
+#if 12<16
+	vld1.64		{d12},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d16,#18
+#if 12>0
+	 vadd.i64	d20,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d16,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d16,#50
+	vsli.64		d25,d16,#46
+	vmov		d29,d16
+	vsli.64		d26,d16,#23
+#if 12<16 && defined(__ARMEL__)
+	vrev64.8	d12,d12
+#endif
+	veor		d25,d24
+	vbsl		d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64		d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d12
+	vsli.64		d25,d20,#30
+	veor		d30,d20,d21
+	vsli.64		d26,d20,#25
+	veor		d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d22,d21		@ Maj(a,b,c)
+	veor		d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 13
+#if 13<16
+	vld1.64		{d13},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 13>0
+	 vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d23,#50
+	vsli.64		d25,d23,#46
+	vmov		d29,d23
+	vsli.64		d26,d23,#23
+#if 13<16 && defined(__ARMEL__)
+	vrev64.8	d13,d13
+#endif
+	veor		d25,d24
+	vbsl		d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64		d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d13
+	vsli.64		d25,d19,#30
+	veor		d30,d19,d20
+	vsli.64		d26,d19,#25
+	veor		d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d21,d20		@ Maj(a,b,c)
+	veor		d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	d24,d22,#14	@ 14
+#if 14<16
+	vld1.64		{d14},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d22,#18
+#if 14>0
+	 vadd.i64	d18,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d22,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d22,#50
+	vsli.64		d25,d22,#46
+	vmov		d29,d22
+	vsli.64		d26,d22,#23
+#if 14<16 && defined(__ARMEL__)
+	vrev64.8	d14,d14
+#endif
+	veor		d25,d24
+	vbsl		d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64		d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d14
+	vsli.64		d25,d18,#30
+	veor		d30,d18,d19
+	vsli.64		d26,d18,#25
+	veor		d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d20,d19		@ Maj(a,b,c)
+	veor		d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 15
+#if 15<16
+	vld1.64		{d15},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 15>0
+	 vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d21,#50
+	vsli.64		d25,d21,#46
+	vmov		d29,d21
+	vsli.64		d26,d21,#23
+#if 15<16 && defined(__ARMEL__)
+	vrev64.8	d15,d15
+#endif
+	veor		d25,d24
+	vbsl		d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64		d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d15
+	vsli.64		d25,d17,#30
+	veor		d30,d17,d18
+	vsli.64		d26,d17,#25
+	veor		d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d19,d18		@ Maj(a,b,c)
+	veor		d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	mov		r12,#4
+.L16_79_neon:
+	subs		r12,#1
+	vshr.u64	q12,q7,#19
+	vshr.u64	q13,q7,#61
+	 vadd.i64	d16,d30			@ h+=Maj from the past
+	vshr.u64	q15,q7,#6
+	vsli.64		q12,q7,#45
+	vext.8		q14,q0,q1,#8	@ X[i+1]
+	vsli.64		q13,q7,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q0,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q4,q5,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d20,#14		@ from NEON_00_15
+	vadd.i64	q0,q14
+	vshr.u64	d25,d20,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d20,#41		@ from NEON_00_15
+	vadd.i64	q0,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d20,#50
+	vsli.64		d25,d20,#46
+	vmov		d29,d20
+	vsli.64		d26,d20,#23
+#if 16<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64		d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d0
+	vsli.64		d25,d16,#30
+	veor		d30,d16,d17
+	vsli.64		d26,d16,#25
+	veor		d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d18,d17		@ Maj(a,b,c)
+	veor		d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 17
+#if 17<16
+	vld1.64		{d1},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 17>0
+	 vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d19,#50
+	vsli.64		d25,d19,#46
+	vmov		d29,d19
+	vsli.64		d26,d19,#23
+#if 17<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64		d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d1
+	vsli.64		d25,d23,#30
+	veor		d30,d23,d16
+	vsli.64		d26,d23,#25
+	veor		d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d17,d16		@ Maj(a,b,c)
+	veor		d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	q12,q0,#19
+	vshr.u64	q13,q0,#61
+	 vadd.i64	d22,d30			@ h+=Maj from the past
+	vshr.u64	q15,q0,#6
+	vsli.64		q12,q0,#45
+	vext.8		q14,q1,q2,#8	@ X[i+1]
+	vsli.64		q13,q0,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q1,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q5,q6,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d18,#14		@ from NEON_00_15
+	vadd.i64	q1,q14
+	vshr.u64	d25,d18,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d18,#41		@ from NEON_00_15
+	vadd.i64	q1,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d18,#50
+	vsli.64		d25,d18,#46
+	vmov		d29,d18
+	vsli.64		d26,d18,#23
+#if 18<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64		d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d2
+	vsli.64		d25,d22,#30
+	veor		d30,d22,d23
+	vsli.64		d26,d22,#25
+	veor		d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d16,d23		@ Maj(a,b,c)
+	veor		d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 19
+#if 19<16
+	vld1.64		{d3},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 19>0
+	 vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d17,#50
+	vsli.64		d25,d17,#46
+	vmov		d29,d17
+	vsli.64		d26,d17,#23
+#if 19<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64		d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d3
+	vsli.64		d25,d21,#30
+	veor		d30,d21,d22
+	vsli.64		d26,d21,#25
+	veor		d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d23,d22		@ Maj(a,b,c)
+	veor		d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	q12,q1,#19
+	vshr.u64	q13,q1,#61
+	 vadd.i64	d20,d30			@ h+=Maj from the past
+	vshr.u64	q15,q1,#6
+	vsli.64		q12,q1,#45
+	vext.8		q14,q2,q3,#8	@ X[i+1]
+	vsli.64		q13,q1,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q2,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q6,q7,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d16,#14		@ from NEON_00_15
+	vadd.i64	q2,q14
+	vshr.u64	d25,d16,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d16,#41		@ from NEON_00_15
+	vadd.i64	q2,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d16,#50
+	vsli.64		d25,d16,#46
+	vmov		d29,d16
+	vsli.64		d26,d16,#23
+#if 20<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64		d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d4
+	vsli.64		d25,d20,#30
+	veor		d30,d20,d21
+	vsli.64		d26,d20,#25
+	veor		d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d22,d21		@ Maj(a,b,c)
+	veor		d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 21
+#if 21<16
+	vld1.64		{d5},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 21>0
+	 vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d23,#50
+	vsli.64		d25,d23,#46
+	vmov		d29,d23
+	vsli.64		d26,d23,#23
+#if 21<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64		d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d5
+	vsli.64		d25,d19,#30
+	veor		d30,d19,d20
+	vsli.64		d26,d19,#25
+	veor		d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d21,d20		@ Maj(a,b,c)
+	veor		d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	q12,q2,#19
+	vshr.u64	q13,q2,#61
+	 vadd.i64	d18,d30			@ h+=Maj from the past
+	vshr.u64	q15,q2,#6
+	vsli.64		q12,q2,#45
+	vext.8		q14,q3,q4,#8	@ X[i+1]
+	vsli.64		q13,q2,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q3,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q7,q0,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d22,#14		@ from NEON_00_15
+	vadd.i64	q3,q14
+	vshr.u64	d25,d22,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d22,#41		@ from NEON_00_15
+	vadd.i64	q3,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d22,#50
+	vsli.64		d25,d22,#46
+	vmov		d29,d22
+	vsli.64		d26,d22,#23
+#if 22<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64		d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d6
+	vsli.64		d25,d18,#30
+	veor		d30,d18,d19
+	vsli.64		d26,d18,#25
+	veor		d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d20,d19		@ Maj(a,b,c)
+	veor		d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 23
+#if 23<16
+	vld1.64		{d7},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 23>0
+	 vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d21,#50
+	vsli.64		d25,d21,#46
+	vmov		d29,d21
+	vsli.64		d26,d21,#23
+#if 23<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64		d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d7
+	vsli.64		d25,d17,#30
+	veor		d30,d17,d18
+	vsli.64		d26,d17,#25
+	veor		d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d19,d18		@ Maj(a,b,c)
+	veor		d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	vshr.u64	q12,q3,#19
+	vshr.u64	q13,q3,#61
+	 vadd.i64	d16,d30			@ h+=Maj from the past
+	vshr.u64	q15,q3,#6
+	vsli.64		q12,q3,#45
+	vext.8		q14,q4,q5,#8	@ X[i+1]
+	vsli.64		q13,q3,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q4,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q0,q1,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d20,#14		@ from NEON_00_15
+	vadd.i64	q4,q14
+	vshr.u64	d25,d20,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d20,#41		@ from NEON_00_15
+	vadd.i64	q4,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d20,#50
+	vsli.64		d25,d20,#46
+	vmov		d29,d20
+	vsli.64		d26,d20,#23
+#if 24<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d21,d22		@ Ch(e,f,g)
+	vshr.u64	d24,d16,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d23
+	vshr.u64	d25,d16,#34
+	vsli.64		d24,d16,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d16,#39
+	vadd.i64	d28,d8
+	vsli.64		d25,d16,#30
+	veor		d30,d16,d17
+	vsli.64		d26,d16,#25
+	veor		d23,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d18,d17		@ Maj(a,b,c)
+	veor		d23,d26			@ Sigma0(a)
+	vadd.i64	d19,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d23,d30
+	vshr.u64	d24,d19,#14	@ 25
+#if 25<16
+	vld1.64		{d9},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d19,#18
+#if 25>0
+	 vadd.i64	d23,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d19,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d19,#50
+	vsli.64		d25,d19,#46
+	vmov		d29,d19
+	vsli.64		d26,d19,#23
+#if 25<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d20,d21		@ Ch(e,f,g)
+	vshr.u64	d24,d23,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d22
+	vshr.u64	d25,d23,#34
+	vsli.64		d24,d23,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d23,#39
+	vadd.i64	d28,d9
+	vsli.64		d25,d23,#30
+	veor		d30,d23,d16
+	vsli.64		d26,d23,#25
+	veor		d22,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d17,d16		@ Maj(a,b,c)
+	veor		d22,d26			@ Sigma0(a)
+	vadd.i64	d18,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d22,d30
+	vshr.u64	q12,q4,#19
+	vshr.u64	q13,q4,#61
+	 vadd.i64	d22,d30			@ h+=Maj from the past
+	vshr.u64	q15,q4,#6
+	vsli.64		q12,q4,#45
+	vext.8		q14,q5,q6,#8	@ X[i+1]
+	vsli.64		q13,q4,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q5,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q1,q2,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d18,#14		@ from NEON_00_15
+	vadd.i64	q5,q14
+	vshr.u64	d25,d18,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d18,#41		@ from NEON_00_15
+	vadd.i64	q5,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d18,#50
+	vsli.64		d25,d18,#46
+	vmov		d29,d18
+	vsli.64		d26,d18,#23
+#if 26<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d19,d20		@ Ch(e,f,g)
+	vshr.u64	d24,d22,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d21
+	vshr.u64	d25,d22,#34
+	vsli.64		d24,d22,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d22,#39
+	vadd.i64	d28,d10
+	vsli.64		d25,d22,#30
+	veor		d30,d22,d23
+	vsli.64		d26,d22,#25
+	veor		d21,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d16,d23		@ Maj(a,b,c)
+	veor		d21,d26			@ Sigma0(a)
+	vadd.i64	d17,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d21,d30
+	vshr.u64	d24,d17,#14	@ 27
+#if 27<16
+	vld1.64		{d11},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d17,#18
+#if 27>0
+	 vadd.i64	d21,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d17,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d17,#50
+	vsli.64		d25,d17,#46
+	vmov		d29,d17
+	vsli.64		d26,d17,#23
+#if 27<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d18,d19		@ Ch(e,f,g)
+	vshr.u64	d24,d21,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d20
+	vshr.u64	d25,d21,#34
+	vsli.64		d24,d21,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d21,#39
+	vadd.i64	d28,d11
+	vsli.64		d25,d21,#30
+	veor		d30,d21,d22
+	vsli.64		d26,d21,#25
+	veor		d20,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d23,d22		@ Maj(a,b,c)
+	veor		d20,d26			@ Sigma0(a)
+	vadd.i64	d16,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d20,d30
+	vshr.u64	q12,q5,#19
+	vshr.u64	q13,q5,#61
+	 vadd.i64	d20,d30			@ h+=Maj from the past
+	vshr.u64	q15,q5,#6
+	vsli.64		q12,q5,#45
+	vext.8		q14,q6,q7,#8	@ X[i+1]
+	vsli.64		q13,q5,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q6,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q2,q3,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d16,#14		@ from NEON_00_15
+	vadd.i64	q6,q14
+	vshr.u64	d25,d16,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d16,#41		@ from NEON_00_15
+	vadd.i64	q6,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d16,#50
+	vsli.64		d25,d16,#46
+	vmov		d29,d16
+	vsli.64		d26,d16,#23
+#if 28<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d17,d18		@ Ch(e,f,g)
+	vshr.u64	d24,d20,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d19
+	vshr.u64	d25,d20,#34
+	vsli.64		d24,d20,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d20,#39
+	vadd.i64	d28,d12
+	vsli.64		d25,d20,#30
+	veor		d30,d20,d21
+	vsli.64		d26,d20,#25
+	veor		d19,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d22,d21		@ Maj(a,b,c)
+	veor		d19,d26			@ Sigma0(a)
+	vadd.i64	d23,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d19,d30
+	vshr.u64	d24,d23,#14	@ 29
+#if 29<16
+	vld1.64		{d13},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d23,#18
+#if 29>0
+	 vadd.i64	d19,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d23,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d23,#50
+	vsli.64		d25,d23,#46
+	vmov		d29,d23
+	vsli.64		d26,d23,#23
+#if 29<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d16,d17		@ Ch(e,f,g)
+	vshr.u64	d24,d19,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d18
+	vshr.u64	d25,d19,#34
+	vsli.64		d24,d19,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d19,#39
+	vadd.i64	d28,d13
+	vsli.64		d25,d19,#30
+	veor		d30,d19,d20
+	vsli.64		d26,d19,#25
+	veor		d18,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d21,d20		@ Maj(a,b,c)
+	veor		d18,d26			@ Sigma0(a)
+	vadd.i64	d22,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d18,d30
+	vshr.u64	q12,q6,#19
+	vshr.u64	q13,q6,#61
+	 vadd.i64	d18,d30			@ h+=Maj from the past
+	vshr.u64	q15,q6,#6
+	vsli.64		q12,q6,#45
+	vext.8		q14,q7,q0,#8	@ X[i+1]
+	vsli.64		q13,q6,#3
+	veor		q15,q12
+	vshr.u64	q12,q14,#1
+	veor		q15,q13				@ sigma1(X[i+14])
+	vshr.u64	q13,q14,#8
+	vadd.i64	q7,q15
+	vshr.u64	q15,q14,#7
+	vsli.64		q12,q14,#63
+	vsli.64		q13,q14,#56
+	vext.8		q14,q3,q4,#8	@ X[i+9]
+	veor		q15,q12
+	vshr.u64	d24,d22,#14		@ from NEON_00_15
+	vadd.i64	q7,q14
+	vshr.u64	d25,d22,#18		@ from NEON_00_15
+	veor		q15,q13				@ sigma0(X[i+1])
+	vshr.u64	d26,d22,#41		@ from NEON_00_15
+	vadd.i64	q7,q15
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d22,#50
+	vsli.64		d25,d22,#46
+	vmov		d29,d22
+	vsli.64		d26,d22,#23
+#if 30<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d23,d16		@ Ch(e,f,g)
+	vshr.u64	d24,d18,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d17
+	vshr.u64	d25,d18,#34
+	vsli.64		d24,d18,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d18,#39
+	vadd.i64	d28,d14
+	vsli.64		d25,d18,#30
+	veor		d30,d18,d19
+	vsli.64		d26,d18,#25
+	veor		d17,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d20,d19		@ Maj(a,b,c)
+	veor		d17,d26			@ Sigma0(a)
+	vadd.i64	d21,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d17,d30
+	vshr.u64	d24,d21,#14	@ 31
+#if 31<16
+	vld1.64		{d15},[r1]!	@ handles unaligned
+#endif
+	vshr.u64	d25,d21,#18
+#if 31>0
+	 vadd.i64	d17,d30			@ h+=Maj from the past
+#endif
+	vshr.u64	d26,d21,#41
+	vld1.64		{d28},[r3,:64]!	@ K[i++]
+	vsli.64		d24,d21,#50
+	vsli.64		d25,d21,#46
+	vmov		d29,d21
+	vsli.64		d26,d21,#23
+#if 31<16 && defined(__ARMEL__)
+	vrev64.8	,
+#endif
+	veor		d25,d24
+	vbsl		d29,d22,d23		@ Ch(e,f,g)
+	vshr.u64	d24,d17,#28
+	veor		d26,d25			@ Sigma1(e)
+	vadd.i64	d27,d29,d16
+	vshr.u64	d25,d17,#34
+	vsli.64		d24,d17,#36
+	vadd.i64	d27,d26
+	vshr.u64	d26,d17,#39
+	vadd.i64	d28,d15
+	vsli.64		d25,d17,#30
+	veor		d30,d17,d18
+	vsli.64		d26,d17,#25
+	veor		d16,d24,d25
+	vadd.i64	d27,d28
+	vbsl		d30,d19,d18		@ Maj(a,b,c)
+	veor		d16,d26			@ Sigma0(a)
+	vadd.i64	d20,d27
+	vadd.i64	d30,d27
+	@ vadd.i64	d16,d30
+	bne		.L16_79_neon
+
+	 vadd.i64	d16,d30		@ h+=Maj from the past
+	vldmia		r0,{d24-d31}	@ load context to temp
+	vadd.i64	q8,q12		@ vectorized accumulate
+	vadd.i64	q9,q13
+	vadd.i64	q10,q14
+	vadd.i64	q11,q15
+	vstmia		r0,{d16-d23}	@ save context
+	teq		r1,r2
+	sub		r3,#640	@ rewind K512
+	bne		.Loop_neon
+
+	VFP_ABI_POP
+	bx	lr				@ .word	0xe12fff1e
+.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
+#endif
+.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
+.align	2
+#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
+.comm	OPENSSL_armcap_P,4,4
+#endif
diff --git a/arch/arm/crypto/sha512-glue.c b/arch/arm/crypto/sha512-glue.c
new file mode 100644
index 000000000000..269a394e4a53
--- /dev/null
+++ b/arch/arm/crypto/sha512-glue.c
@@ -0,0 +1,121 @@
+/*
+ * sha512-glue.c - accelerated SHA-384/512 for ARM
+ *
+ * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/sha.h>
+#include <crypto/sha512_base.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+#include <asm/hwcap.h>
+#include <asm/neon.h>
+
+#include "sha512.h"
+
+MODULE_DESCRIPTION("Accelerated SHA-384/SHA-512 secure hash for ARM");
+MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
+MODULE_LICENSE("GPL v2");
+
+MODULE_ALIAS_CRYPTO("sha384");
+MODULE_ALIAS_CRYPTO("sha512");
+MODULE_ALIAS_CRYPTO("sha384-arm");
+MODULE_ALIAS_CRYPTO("sha512-arm");
+
+asmlinkage void sha512_block_data_order(u64 *state, u8 const *src, int blocks);
+
+int sha512_arm_update(struct shash_desc *desc, const u8 *data,
+		      unsigned int len)
+{
+	return sha512_base_do_update(desc, data, len,
+		(sha512_block_fn *)sha512_block_data_order);
+}
+
+int sha512_arm_final(struct shash_desc *desc, u8 *out)
+{
+	sha512_base_do_finalize(desc,
+		(sha512_block_fn *)sha512_block_data_order);
+	return sha512_base_finish(desc, out);
+}
+
+int sha512_arm_finup(struct shash_desc *desc, const u8 *data,
+		     unsigned int len, u8 *out)
+{
+	sha512_base_do_update(desc, data, len,
+		(sha512_block_fn *)sha512_block_data_order);
+	return sha512_arm_final(desc, out);
+}
+
+static struct shash_alg sha512_arm_algs[] = { {
+	.init			= sha384_base_init,
+	.update			= sha512_arm_update,
+	.final			= sha512_arm_final,
+	.finup			= sha512_arm_finup,
+	.descsize		= sizeof(struct sha512_state),
+	.digestsize		= SHA384_DIGEST_SIZE,
+	.base			= {
+		.cra_name		= "sha384",
+		.cra_driver_name	= "sha384-arm",
+		.cra_priority		= 250,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize		= SHA512_BLOCK_SIZE,
+		.cra_module		= THIS_MODULE,
+	}
+},  {
+	.init			= sha512_base_init,
+	.update			= sha512_arm_update,
+	.final			= sha512_arm_final,
+	.finup			= sha512_arm_finup,
+	.descsize		= sizeof(struct sha512_state),
+	.digestsize		= SHA512_DIGEST_SIZE,
+	.base			= {
+		.cra_name		= "sha512",
+		.cra_driver_name	= "sha512-arm",
+		.cra_priority		= 250,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize		= SHA512_BLOCK_SIZE,
+		.cra_module		= THIS_MODULE,
+	}
+} };
+
+static int __init sha512_arm_mod_init(void)
+{
+	int err;
+
+	err = crypto_register_shashes(sha512_arm_algs,
+				      ARRAY_SIZE(sha512_arm_algs));
+	if (err)
+		return err;
+
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon()) {
+		err = crypto_register_shashes(sha512_neon_algs,
+					      ARRAY_SIZE(sha512_neon_algs));
+		if (err)
+			goto err_unregister;
+	}
+	return 0;
+
+err_unregister:
+	crypto_unregister_shashes(sha512_arm_algs,
+				  ARRAY_SIZE(sha512_arm_algs));
+
+	return err;
+}
+
+static void __exit sha512_arm_mod_fini(void)
+{
+	crypto_unregister_shashes(sha512_arm_algs,
+				  ARRAY_SIZE(sha512_arm_algs));
+	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && cpu_has_neon())
+		crypto_unregister_shashes(sha512_neon_algs,
+					  ARRAY_SIZE(sha512_neon_algs));
+}
+
+module_init(sha512_arm_mod_init);
+module_exit(sha512_arm_mod_fini);
diff --git a/arch/arm/crypto/sha512-neon-glue.c b/arch/arm/crypto/sha512-neon-glue.c
new file mode 100644
index 000000000000..32693684a3ab
--- /dev/null
+++ b/arch/arm/crypto/sha512-neon-glue.c
@@ -0,0 +1,98 @@
+/*
+ * sha512-neon-glue.c - accelerated SHA-384/512 for ARM NEON
+ *
+ * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/sha.h>
+#include <crypto/sha512_base.h>
+#include <linux/crypto.h>
+#include <linux/module.h>
+
+#include <asm/simd.h>
+#include <asm/neon.h>
+
+#include "sha512.h"
+
+MODULE_ALIAS_CRYPTO("sha384-neon");
+MODULE_ALIAS_CRYPTO("sha512-neon");
+
+asmlinkage void sha512_block_data_order_neon(u64 *state, u8 const *src,
+					     int blocks);
+
+static int sha512_neon_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	if (!may_use_simd() ||
+	    (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
+		return sha512_arm_update(desc, data, len);
+
+	kernel_neon_begin();
+	sha512_base_do_update(desc, data, len,
+		(sha512_block_fn *)sha512_block_data_order_neon);
+	kernel_neon_end();
+
+	return 0;
+}
+
+static int sha512_neon_finup(struct shash_desc *desc, const u8 *data,
+			     unsigned int len, u8 *out)
+{
+	if (!may_use_simd())
+		return sha512_arm_finup(desc, data, len, out);
+
+	kernel_neon_begin();
+	if (len)
+		sha512_base_do_update(desc, data, len,
+			(sha512_block_fn *)sha512_block_data_order_neon);
+	sha512_base_do_finalize(desc,
+		(sha512_block_fn *)sha512_block_data_order_neon);
+	kernel_neon_end();
+
+	return sha512_base_finish(desc, out);
+}
+
+static int sha512_neon_final(struct shash_desc *desc, u8 *out)
+{
+	return sha512_neon_finup(desc, NULL, 0, out);
+}
+
+struct shash_alg sha512_neon_algs[] = { {
+	.init			= sha384_base_init,
+	.update			= sha512_neon_update,
+	.final			= sha512_neon_final,
+	.finup			= sha512_neon_finup,
+	.descsize		= sizeof(struct sha512_state),
+	.digestsize		= SHA384_DIGEST_SIZE,
+	.base			= {
+		.cra_name		= "sha384",
+		.cra_driver_name	= "sha384-neon",
+		.cra_priority		= 300,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize		= SHA384_BLOCK_SIZE,
+		.cra_module		= THIS_MODULE,
+
+	}
+},  {
+	.init			= sha512_base_init,
+	.update			= sha512_neon_update,
+	.final			= sha512_neon_final,
+	.finup			= sha512_neon_finup,
+	.descsize		= sizeof(struct sha512_state),
+	.digestsize		= SHA512_DIGEST_SIZE,
+	.base			= {
+		.cra_name		= "sha512",
+		.cra_driver_name	= "sha512-neon",
+		.cra_priority		= 300,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize		= SHA512_BLOCK_SIZE,
+		.cra_module		= THIS_MODULE,
+	}
+} };
diff --git a/arch/arm/crypto/sha512.h b/arch/arm/crypto/sha512.h
new file mode 100644
index 000000000000..a75d9a82988a
--- /dev/null
+++ b/arch/arm/crypto/sha512.h
@@ -0,0 +1,8 @@
+
+int sha512_arm_update(struct shash_desc *desc, const u8 *data,
+		      unsigned int len);
+
+int sha512_arm_finup(struct shash_desc *desc, const u8 *data,
+		     unsigned int len, u8 *out);
+
+extern struct shash_alg sha512_neon_algs[2];
diff --git a/arch/arm/crypto/sha512_neon_glue.c b/arch/arm/crypto/sha512_neon_glue.c
deleted file mode 100644
index b124dce838d6..000000000000
--- a/arch/arm/crypto/sha512_neon_glue.c
+++ /dev/null
@@ -1,305 +0,0 @@
-/*
- * Glue code for the SHA512 Secure Hash Algorithm assembly implementation
- * using NEON instructions.
- *
- * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
- *
- * This file is based on sha512_ssse3_glue.c:
- *   Copyright (C) 2013 Intel Corporation
- *   Author: Tim Chen <tim.c.chen@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- */
-
-#include <crypto/internal/hash.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/cryptohash.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <crypto/sha.h>
-#include <asm/byteorder.h>
-#include <asm/simd.h>
-#include <asm/neon.h>
-
-
-static const u64 sha512_k[] = {
-	0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
-	0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
-	0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
-	0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
-	0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
-	0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
-	0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
-	0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
-	0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
-	0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
-	0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
-	0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
-	0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
-	0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
-	0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
-	0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
-	0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
-	0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
-	0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
-	0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
-	0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
-	0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
-	0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
-	0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
-	0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
-	0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
-	0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
-	0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
-	0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
-	0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
-	0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
-	0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
-	0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
-	0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
-	0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
-	0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
-	0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
-	0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
-	0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
-	0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
-};
-
-
-asmlinkage void sha512_transform_neon(u64 *digest, const void *data,
-				      const u64 k[], unsigned int num_blks);
-
-
-static int sha512_neon_init(struct shash_desc *desc)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA512_H0;
-	sctx->state[1] = SHA512_H1;
-	sctx->state[2] = SHA512_H2;
-	sctx->state[3] = SHA512_H3;
-	sctx->state[4] = SHA512_H4;
-	sctx->state[5] = SHA512_H5;
-	sctx->state[6] = SHA512_H6;
-	sctx->state[7] = SHA512_H7;
-	sctx->count[0] = sctx->count[1] = 0;
-
-	return 0;
-}
-
-static int __sha512_neon_update(struct shash_desc *desc, const u8 *data,
-				unsigned int len, unsigned int partial)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-	unsigned int done = 0;
-
-	sctx->count[0] += len;
-	if (sctx->count[0] < len)
-		sctx->count[1]++;
-
-	if (partial) {
-		done = SHA512_BLOCK_SIZE - partial;
-		memcpy(sctx->buf + partial, data, done);
-		sha512_transform_neon(sctx->state, sctx->buf, sha512_k, 1);
-	}
-
-	if (len - done >= SHA512_BLOCK_SIZE) {
-		const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
-
-		sha512_transform_neon(sctx->state, data + done, sha512_k,
-				      rounds);
-
-		done += rounds * SHA512_BLOCK_SIZE;
-	}
-
-	memcpy(sctx->buf, data + done, len - done);
-
-	return 0;
-}
-
-static int sha512_neon_update(struct shash_desc *desc, const u8 *data,
-			     unsigned int len)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-	unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
-	int res;
-
-	/* Handle the fast case right here */
-	if (partial + len < SHA512_BLOCK_SIZE) {
-		sctx->count[0] += len;
-		if (sctx->count[0] < len)
-			sctx->count[1]++;
-		memcpy(sctx->buf + partial, data, len);
-
-		return 0;
-	}
-
-	if (!may_use_simd()) {
-		res = crypto_sha512_update(desc, data, len);
-	} else {
-		kernel_neon_begin();
-		res = __sha512_neon_update(desc, data, len, partial);
-		kernel_neon_end();
-	}
-
-	return res;
-}
-
-
-/* Add padding and return the message digest. */
-static int sha512_neon_final(struct shash_desc *desc, u8 *out)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-	unsigned int i, index, padlen;
-	__be64 *dst = (__be64 *)out;
-	__be64 bits[2];
-	static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
-
-	/* save number of bits */
-	bits[1] = cpu_to_be64(sctx->count[0] << 3);
-	bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
-
-	/* Pad out to 112 mod 128 and append length */
-	index = sctx->count[0] & 0x7f;
-	padlen = (index < 112) ? (112 - index) : ((128+112) - index);
-
-	if (!may_use_simd()) {
-		crypto_sha512_update(desc, padding, padlen);
-		crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
-	} else {
-		kernel_neon_begin();
-		/* We need to fill a whole block for __sha512_neon_update() */
-		if (padlen <= 112) {
-			sctx->count[0] += padlen;
-			if (sctx->count[0] < padlen)
-				sctx->count[1]++;
-			memcpy(sctx->buf + index, padding, padlen);
-		} else {
-			__sha512_neon_update(desc, padding, padlen, index);
-		}
-		__sha512_neon_update(desc, (const u8 *)&bits,
-					sizeof(bits), 112);
-		kernel_neon_end();
-	}
-
-	/* Store state in digest */
-	for (i = 0; i < 8; i++)
-		dst[i] = cpu_to_be64(sctx->state[i]);
-
-	/* Wipe context */
-	memset(sctx, 0, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha512_neon_export(struct shash_desc *desc, void *out)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(out, sctx, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha512_neon_import(struct shash_desc *desc, const void *in)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha384_neon_init(struct shash_desc *desc)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA384_H0;
-	sctx->state[1] = SHA384_H1;
-	sctx->state[2] = SHA384_H2;
-	sctx->state[3] = SHA384_H3;
-	sctx->state[4] = SHA384_H4;
-	sctx->state[5] = SHA384_H5;
-	sctx->state[6] = SHA384_H6;
-	sctx->state[7] = SHA384_H7;
-
-	sctx->count[0] = sctx->count[1] = 0;
-
-	return 0;
-}
-
-static int sha384_neon_final(struct shash_desc *desc, u8 *hash)
-{
-	u8 D[SHA512_DIGEST_SIZE];
-
-	sha512_neon_final(desc, D);
-
-	memcpy(hash, D, SHA384_DIGEST_SIZE);
-	memzero_explicit(D, SHA512_DIGEST_SIZE);
-
-	return 0;
-}
-
-static struct shash_alg algs[] = { {
-	.digestsize	=	SHA512_DIGEST_SIZE,
-	.init		=	sha512_neon_init,
-	.update		=	sha512_neon_update,
-	.final		=	sha512_neon_final,
-	.export		=	sha512_neon_export,
-	.import		=	sha512_neon_import,
-	.descsize	=	sizeof(struct sha512_state),
-	.statesize	=	sizeof(struct sha512_state),
-	.base		=	{
-		.cra_name	=	"sha512",
-		.cra_driver_name =	"sha512-neon",
-		.cra_priority	=	250,
-		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
-		.cra_blocksize	=	SHA512_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-},  {
-	.digestsize	=	SHA384_DIGEST_SIZE,
-	.init		=	sha384_neon_init,
-	.update		=	sha512_neon_update,
-	.final		=	sha384_neon_final,
-	.export		=	sha512_neon_export,
-	.import		=	sha512_neon_import,
-	.descsize	=	sizeof(struct sha512_state),
-	.statesize	=	sizeof(struct sha512_state),
-	.base		=	{
-		.cra_name	=	"sha384",
-		.cra_driver_name =	"sha384-neon",
-		.cra_priority	=	250,
-		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
-		.cra_blocksize	=	SHA384_BLOCK_SIZE,
-		.cra_module	=	THIS_MODULE,
-	}
-} };
-
-static int __init sha512_neon_mod_init(void)
-{
-	if (!cpu_has_neon())
-		return -ENODEV;
-
-	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
-}
-
-static void __exit sha512_neon_mod_fini(void)
-{
-	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
-}
-
-module_init(sha512_neon_mod_init);
-module_exit(sha512_neon_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, NEON accelerated");
-
-MODULE_ALIAS_CRYPTO("sha512");
-MODULE_ALIAS_CRYPTO("sha384");
diff --git a/arch/arm/include/asm/barrier.h b/arch/arm/include/asm/barrier.h
index d2f81e6b8c1c..6c2327e1c732 100644
--- a/arch/arm/include/asm/barrier.h
+++ b/arch/arm/include/asm/barrier.h
@@ -81,7 +81,7 @@ do {									\
 #define read_barrier_depends()		do { } while(0)
 #define smp_read_barrier_depends()	do { } while(0)
 
-#define set_mb(var, value)	do { var = value; smp_mb(); } while (0)
+#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_mb__before_atomic()	smp_mb()
 #define smp_mb__after_atomic()	smp_mb()
diff --git a/arch/arm/include/asm/edac.h b/arch/arm/include/asm/edac.h
index 0df7a2c1fc3d..5189fa819b60 100644
--- a/arch/arm/include/asm/edac.h
+++ b/arch/arm/include/asm/edac.h
@@ -18,11 +18,12 @@
 #define ASM_EDAC_H
 /*
  * ECC atomic, DMA, SMP and interrupt safe scrub function.
- * Implements the per arch atomic_scrub() that EDAC use for software
+ * Implements the per arch edac_atomic_scrub() that EDAC use for software
  * ECC scrubbing.  It reads memory and then writes back the original
  * value, allowing the hardware to detect and correct memory errors.
  */
-static inline void atomic_scrub(void *va, u32 size)
+
+static inline void edac_atomic_scrub(void *va, u32 size)
 {
 #if __LINUX_ARM_ARCH__ >= 6
 	unsigned int *virt_addr = va;
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index 4e78065a16aa..5eed82809d82 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -93,6 +93,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
+	preempt_disable();
 	__asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
 	"1:	" TUSER(ldr) "	%1, [%4]\n"
 	"	teq	%1, %2\n"
@@ -104,6 +105,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 	: "cc", "memory");
 
 	*uval = val;
+	preempt_enable();
+
 	return ret;
 }
 
@@ -124,7 +127,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
-	pagefault_disable();	/* implies preempt_disable() */
+#ifndef CONFIG_SMP
+	preempt_disable();
+#endif
+	pagefault_disable();
 
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -146,7 +152,10 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 		ret = -ENOSYS;
 	}
 
-	pagefault_enable();	/* subsumes preempt_enable() */
+	pagefault_enable();
+#ifndef CONFIG_SMP
+	preempt_enable();
+#endif
 
 	if (!ret) {
 		switch (cmp) {
diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h
index db58deb00aa7..1b7677d1e5e1 100644
--- a/arch/arm/include/asm/io.h
+++ b/arch/arm/include/asm/io.h
@@ -336,6 +336,7 @@ extern void _memset_io(volatile void __iomem *, int, size_t);
 #define ioremap_nocache(cookie,size)	__arm_ioremap((cookie), (size), MT_DEVICE)
 #define ioremap_cache(cookie,size)	__arm_ioremap((cookie), (size), MT_DEVICE_CACHED)
 #define ioremap_wc(cookie,size)		__arm_ioremap((cookie), (size), MT_DEVICE_WC)
+#define ioremap_wt(cookie,size)		__arm_ioremap((cookie), (size), MT_DEVICE)
 #define iounmap				__arm_iounmap
 
 /*
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 25410b2d8bc1..194c91b610ff 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -23,7 +23,7 @@
 #define c0_MPIDR	1	/* MultiProcessor ID Register */
 #define c0_CSSELR	2	/* Cache Size Selection Register */
 #define c1_SCTLR	3	/* System Control Register */
-#define c1_ACTLR	4	/* Auxilliary Control Register */
+#define c1_ACTLR	4	/* Auxiliary Control Register */
 #define c1_CPACR	5	/* Coprocessor Access Control */
 #define c2_TTBR0	6	/* Translation Table Base Register 0 */
 #define c2_TTBR0_high	7	/* TTBR0 top 32 bits */
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index d71607c16601..e896d2c196e6 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -218,11 +218,6 @@ static inline int kvm_arch_dev_ioctl_check_extension(long ext)
 	return 0;
 }
 
-static inline void vgic_arch_setup(const struct vgic_params *vgic)
-{
-	BUG_ON(vgic->type != VGIC_V2);
-}
-
 int kvm_perf_init(void);
 int kvm_perf_teardown(void);
 
diff --git a/arch/arm/include/asm/pci.h b/arch/arm/include/asm/pci.h
index 585dc33a7a24..a5635444ca41 100644
--- a/arch/arm/include/asm/pci.h
+++ b/arch/arm/include/asm/pci.h
@@ -31,16 +31,6 @@ static inline int pci_proc_domain(struct pci_bus *bus)
  */
 #define PCI_DMA_BUS_IS_PHYS     (1)
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
-#endif
-
 #define HAVE_PCI_MMAP
 extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
                                enum pci_mmap_state mmap_state, int write_combine);
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 2fe85fff5cca..370f7a732900 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -18,7 +18,7 @@ extern struct cputopo_arm cpu_topology[NR_CPUS];
 #define topology_physical_package_id(cpu)	(cpu_topology[cpu].socket_id)
 #define topology_core_id(cpu)		(cpu_topology[cpu].core_id)
 #define topology_core_cpumask(cpu)	(&cpu_topology[cpu].core_sibling)
-#define topology_thread_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
+#define topology_sibling_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
 
 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index f8ccc21fa032..4e7f40c577e6 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -33,7 +33,9 @@ ret_fast_syscall:
  UNWIND(.fnstart	)
  UNWIND(.cantunwind	)
 	disable_irq				@ disable interrupts
-	ldr	r1, [tsk, #TI_FLAGS]
+	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
+	tst	r1, #_TIF_SYSCALL_WORK
+	bne	__sys_trace_return
 	tst	r1, #_TIF_WORK_MASK
 	bne	fast_work_pending
 	asm_trace_hardirqs_on
diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
index 213919ba326f..3b8c2833c537 100644
--- a/arch/arm/kernel/perf_event_cpu.c
+++ b/arch/arm/kernel/perf_event_cpu.c
@@ -304,16 +304,17 @@ static int probe_current_pmu(struct arm_pmu *pmu)
 static int of_pmu_irq_cfg(struct platform_device *pdev)
 {
 	int i, irq;
-	int *irqs = kcalloc(pdev->num_resources, sizeof(*irqs), GFP_KERNEL);
-
-	if (!irqs)
-		return -ENOMEM;
+	int *irqs;
 
 	/* Don't bother with PPIs; they're already affine */
 	irq = platform_get_irq(pdev, 0);
 	if (irq >= 0 && irq_is_percpu(irq))
 		return 0;
 
+	irqs = kcalloc(pdev->num_resources, sizeof(*irqs), GFP_KERNEL);
+	if (!irqs)
+		return -ENOMEM;
+
 	for (i = 0; i < pdev->num_resources; ++i) {
 		struct device_node *dn;
 		int cpu;
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index f1f79d104309..bfb915d05665 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
 	select SRCU
 	select MMU_NOTIFIER
+	select KVM_VFIO
 	select HAVE_KVM_EVENTFD
 	select HAVE_KVM_IRQFD
 	depends on ARM_VIRT_EXT && ARM_LPAE && ARM_ARCH_TIMER
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index 139e46c08b6e..c5eef02c52ba 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -15,7 +15,7 @@ AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt)
 AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt)
 
 KVM := ../../../virt/kvm
-kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o
+kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o
 
 obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index d9631ecddd56..bc738d2b8392 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -171,7 +171,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	int r;
 	switch (ext) {
 	case KVM_CAP_IRQCHIP:
-	case KVM_CAP_IRQFD:
 	case KVM_CAP_IOEVENTFD:
 	case KVM_CAP_DEVICE_CTRL:
 	case KVM_CAP_USER_MEMORY:
@@ -532,6 +531,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		kvm_vgic_flush_hwstate(vcpu);
 		kvm_timer_flush_hwstate(vcpu);
 
+		preempt_disable();
 		local_irq_disable();
 
 		/*
@@ -544,6 +544,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 		if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
 			local_irq_enable();
+			preempt_enable();
 			kvm_timer_sync_hwstate(vcpu);
 			kvm_vgic_sync_hwstate(vcpu);
 			continue;
@@ -553,14 +554,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		 * Enter the guest
 		 */
 		trace_kvm_entry(*vcpu_pc(vcpu));
-		kvm_guest_enter();
+		__kvm_guest_enter();
 		vcpu->mode = IN_GUEST_MODE;
 
 		ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
 
 		vcpu->mode = OUTSIDE_GUEST_MODE;
-		kvm_guest_exit();
-		trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
+		/*
+		 * Back from guest
+		 *************************************************************/
+
 		/*
 		 * We may have taken a host interrupt in HYP mode (ie
 		 * while executing the guest). This interrupt is still
@@ -574,8 +577,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		local_irq_enable();
 
 		/*
-		 * Back from guest
-		 *************************************************************/
+		 * We do local_irq_enable() before calling kvm_guest_exit() so
+		 * that if a timer interrupt hits while running the guest we
+		 * account that tick as being spent in the guest.  We enable
+		 * preemption after calling kvm_guest_exit() so that if we get
+		 * preempted we make sure ticks after that is not counted as
+		 * guest time.
+		 */
+		kvm_guest_exit();
+		trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
+		preempt_enable();
+
 
 		kvm_timer_sync_hwstate(vcpu);
 		kvm_vgic_sync_hwstate(vcpu);
diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
index 79caf79b304a..f7db3a5d80e3 100644
--- a/arch/arm/kvm/interrupts.S
+++ b/arch/arm/kvm/interrupts.S
@@ -170,13 +170,9 @@ __kvm_vcpu_return:
 	@ Don't trap coprocessor accesses for host kernel
 	set_hstr vmexit
 	set_hdcr vmexit
-	set_hcptr vmexit, (HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11))
+	set_hcptr vmexit, (HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11)), after_vfp_restore
 
 #ifdef CONFIG_VFPv3
-	@ Save floating point registers we if let guest use them.
-	tst	r2, #(HCPTR_TCP(10) | HCPTR_TCP(11))
-	bne	after_vfp_restore
-
 	@ Switch VFP/NEON hardware state to the host's
 	add	r7, vcpu, #VCPU_VFP_GUEST
 	store_vfp_state r7
@@ -188,6 +184,8 @@ after_vfp_restore:
 	@ Restore FPEXC_EN which we clobbered on entry
 	pop	{r2}
 	VFPFMXR FPEXC, r2
+#else
+after_vfp_restore:
 #endif
 
 	@ Reset Hyp-role
@@ -483,7 +481,7 @@ switch_to_guest_vfp:
 	push	{r3-r7}
 
 	@ NEON/VFP used.  Turn on VFP access.
-	set_hcptr vmexit, (HCPTR_TCP(10) | HCPTR_TCP(11))
+	set_hcptr vmtrap, (HCPTR_TCP(10) | HCPTR_TCP(11))
 
 	@ Switch VFP/NEON hardware state to the guest's
 	add	r7, r0, #VCPU_VFP_HOST
diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
index 35e4a3a0c476..702740d37465 100644
--- a/arch/arm/kvm/interrupts_head.S
+++ b/arch/arm/kvm/interrupts_head.S
@@ -412,7 +412,6 @@ vcpu	.req	r0		@ vcpu pointer always in r0
 	add	r11, vcpu, #VCPU_VGIC_CPU
 
 	/* Save all interesting registers */
-	ldr	r3, [r2, #GICH_HCR]
 	ldr	r4, [r2, #GICH_VMCR]
 	ldr	r5, [r2, #GICH_MISR]
 	ldr	r6, [r2, #GICH_EISR0]
@@ -420,7 +419,6 @@ vcpu	.req	r0		@ vcpu pointer always in r0
 	ldr	r8, [r2, #GICH_ELRSR0]
 	ldr	r9, [r2, #GICH_ELRSR1]
 	ldr	r10, [r2, #GICH_APR]
-ARM_BE8(rev	r3, r3	)
 ARM_BE8(rev	r4, r4	)
 ARM_BE8(rev	r5, r5	)
 ARM_BE8(rev	r6, r6	)
@@ -429,7 +427,6 @@ ARM_BE8(rev	r8, r8	)
 ARM_BE8(rev	r9, r9	)
 ARM_BE8(rev	r10, r10	)
 
-	str	r3, [r11, #VGIC_V2_CPU_HCR]
 	str	r4, [r11, #VGIC_V2_CPU_VMCR]
 	str	r5, [r11, #VGIC_V2_CPU_MISR]
 #ifdef CONFIG_CPU_ENDIAN_BE8
@@ -591,8 +588,13 @@ ARM_BE8(rev	r6, r6  )
 .endm
 
 /* Configures the HCPTR (Hyp Coprocessor Trap Register) on entry/return
- * (hardware reset value is 0). Keep previous value in r2. */
-.macro set_hcptr operation, mask
+ * (hardware reset value is 0). Keep previous value in r2.
+ * An ISB is emited on vmexit/vmtrap, but executed on vmexit only if
+ * VFP wasn't already enabled (always executed on vmtrap).
+ * If a label is specified with vmexit, it is branched to if VFP wasn't
+ * enabled.
+ */
+.macro set_hcptr operation, mask, label = none
 	mrc	p15, 4, r2, c1, c1, 2
 	ldr	r3, =\mask
 	.if \operation == vmentry
@@ -601,6 +603,17 @@ ARM_BE8(rev	r6, r6  )
 	bic	r3, r2, r3		@ Don't trap defined coproc-accesses
 	.endif
 	mcr	p15, 4, r3, c1, c1, 2
+	.if \operation != vmentry
+	.if \operation == vmexit
+	tst	r2, #(HCPTR_TCP(10) | HCPTR_TCP(11))
+	beq	1f
+	.endif
+	isb
+	.if \label != none
+	b	\label
+	.endif
+1:
+	.endif
 .endm
 
 /* Configures the HDCR (Hyp Debug Configuration Register) on entry/return
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 1d5accbd3dcf..7b4201294187 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -691,8 +691,8 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
 		 * work.  This is not used by the hardware and we have no
 		 * alignment requirement for this allocation.
 		 */
-		pgd = (pgd_t *)kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
-				       GFP_KERNEL | __GFP_ZERO);
+		pgd = kmalloc(PTRS_PER_S2_PGD * sizeof(pgd_t),
+				GFP_KERNEL | __GFP_ZERO);
 
 		if (!pgd) {
 			kvm_free_hwpgd(hwpgd);
@@ -1155,7 +1155,8 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
  */
 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
 {
-	struct kvm_memory_slot *memslot = id_to_memslot(kvm->memslots, slot);
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
 	phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
 	phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
 
@@ -1718,8 +1719,9 @@ out:
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_userspace_memory_region *mem,
 				   const struct kvm_memory_slot *old,
+				   const struct kvm_memory_slot *new,
 				   enum kvm_mr_change change)
 {
 	/*
@@ -1733,7 +1735,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
-				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_userspace_memory_region *mem,
 				   enum kvm_mr_change change)
 {
 	hva_t hva = mem->userspace_addr;
@@ -1838,7 +1840,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 	return 0;
 }
 
-void kvm_arch_memslots_updated(struct kvm *kvm)
+void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
 {
 }
 
diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c
index 02fa8eff6ae1..4b94b513168d 100644
--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@@ -24,6 +24,8 @@
 #include <asm/kvm_psci.h>
 #include <asm/kvm_host.h>
 
+#include <uapi/linux/psci.h>
+
 /*
  * This is an implementation of the Power State Coordination Interface
  * as described in ARM document number ARM DEN 0022A.
@@ -230,10 +232,6 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
 	case PSCI_0_2_FN64_AFFINITY_INFO:
 		val = kvm_psci_vcpu_affinity_info(vcpu);
 		break;
-	case PSCI_0_2_FN_MIGRATE:
-	case PSCI_0_2_FN64_MIGRATE:
-		val = PSCI_RET_NOT_SUPPORTED;
-		break;
 	case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
 		/*
 		 * Trusted OS is MP hence does not require migration
@@ -242,10 +240,6 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
 		 */
 		val = PSCI_0_2_TOS_MP;
 		break;
-	case PSCI_0_2_FN_MIGRATE_INFO_UP_CPU:
-	case PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU:
-		val = PSCI_RET_NOT_SUPPORTED;
-		break;
 	case PSCI_0_2_FN_SYSTEM_OFF:
 		kvm_psci_system_off(vcpu);
 		/*
@@ -271,7 +265,8 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
 		ret = 0;
 		break;
 	default:
-		return -EINVAL;
+		val = PSCI_RET_NOT_SUPPORTED;
+		break;
 	}
 
 	*vcpu_reg(vcpu, 0) = val;
@@ -291,12 +286,9 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
 	case KVM_PSCI_FN_CPU_ON:
 		val = kvm_psci_vcpu_on(vcpu);
 		break;
-	case KVM_PSCI_FN_CPU_SUSPEND:
-	case KVM_PSCI_FN_MIGRATE:
+	default:
 		val = PSCI_RET_NOT_SUPPORTED;
 		break;
-	default:
-		return -EINVAL;
 	}
 
 	*vcpu_reg(vcpu, 0) = val;
diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S
index 947567ff67f9..af2267f6a529 100644
--- a/arch/arm/lib/lib1funcs.S
+++ b/arch/arm/lib/lib1funcs.S
@@ -167,7 +167,7 @@ Boston, MA 02111-1307, USA.  */
 
 #endif
 
-	@ Perform all needed substractions to keep only the reminder.
+	@ Perform all needed subtractions to keep only the reminder.
 	@ Do comparisons in batch of 4 first.
 	subs	\order, \order, #3		@ yes, 3 is intended here
 	blt	2f
@@ -189,7 +189,7 @@ Boston, MA 02111-1307, USA.  */
 	teqne	\dividend, #0
 	beq	5f
 
-	@ Either 1, 2 or 3 comparison/substractions are left.
+	@ Either 1, 2 or 3 comparison/subtractions are left.
 2:	cmn	\order, #2
 	blt	4f
 	beq	3f
diff --git a/arch/arm/mach-davinci/da850.c b/arch/arm/mach-davinci/da850.c
index 45ce065e7170..3b8740c083c4 100644
--- a/arch/arm/mach-davinci/da850.c
+++ b/arch/arm/mach-davinci/da850.c
@@ -11,6 +11,7 @@
  * is licensed "as is" without any warranty of any kind, whether express
  * or implied.
  */
+#include <linux/clkdev.h>
 #include <linux/gpio.h>
 #include <linux/init.h>
 #include <linux/clk.h>
diff --git a/arch/arm/mach-davinci/pm_domain.c b/arch/arm/mach-davinci/pm_domain.c
index 641edc313938..78eac2c0c146 100644
--- a/arch/arm/mach-davinci/pm_domain.c
+++ b/arch/arm/mach-davinci/pm_domain.c
@@ -14,39 +14,9 @@
 #include <linux/pm_clock.h>
 #include <linux/platform_device.h>
 
-#ifdef CONFIG_PM
-static int davinci_pm_runtime_suspend(struct device *dev)
-{
-	int ret;
-
-	dev_dbg(dev, "%s\n", __func__);
-
-	ret = pm_generic_runtime_suspend(dev);
-	if (ret)
-		return ret;
-
-	ret = pm_clk_suspend(dev);
-	if (ret) {
-		pm_generic_runtime_resume(dev);
-		return ret;
-	}
-
-	return 0;
-}
-
-static int davinci_pm_runtime_resume(struct device *dev)
-{
-	dev_dbg(dev, "%s\n", __func__);
-
-	pm_clk_resume(dev);
-	return pm_generic_runtime_resume(dev);
-}
-#endif
-
 static struct dev_pm_domain davinci_pm_domain = {
 	.ops = {
-		SET_RUNTIME_PM_OPS(davinci_pm_runtime_suspend,
-				   davinci_pm_runtime_resume, NULL)
+		USE_PM_CLK_RUNTIME_OPS
 		USE_PLATFORM_PM_SLEEP_OPS
 	},
 };
diff --git a/arch/arm/mach-exynos/suspend.c b/arch/arm/mach-exynos/suspend.c
index c0b6dccbf7bd..7d23ce04cad5 100644
--- a/arch/arm/mach-exynos/suspend.c
+++ b/arch/arm/mach-exynos/suspend.c
@@ -87,8 +87,8 @@ static unsigned int exynos_pmu_spare3;
 static u32 exynos_irqwake_intmask = 0xffffffff;
 
 static const struct exynos_wkup_irq exynos3250_wkup_irq[] = {
-	{ 105, BIT(1) }, /* RTC alarm */
-	{ 106, BIT(2) }, /* RTC tick */
+	{ 73, BIT(1) }, /* RTC alarm */
+	{ 74, BIT(2) }, /* RTC tick */
 	{ /* sentinel */ },
 };
 
diff --git a/arch/arm/mach-gemini/gpio.c b/arch/arm/mach-gemini/gpio.c
index f8cb5710d6ee..3292f2e6ed6f 100644
--- a/arch/arm/mach-gemini/gpio.c
+++ b/arch/arm/mach-gemini/gpio.c
@@ -223,8 +223,8 @@ void __init gemini_gpio_init(void)
 			set_irq_flags(j, IRQF_VALID);
 		}
 
-		irq_set_chained_handler(IRQ_GPIO(i), gpio_irq_handler);
-		irq_set_handler_data(IRQ_GPIO(i), (void *)i);
+		irq_set_chained_handler_and_data(IRQ_GPIO(i), gpio_irq_handler,
+						 (void *)i);
 	}
 
 	BUG_ON(gpiochip_add(&gemini_gpio_chip));
diff --git a/arch/arm/mach-imx/gpc.c b/arch/arm/mach-imx/gpc.c
index 4d60005e9277..6d0893a3828e 100644
--- a/arch/arm/mach-imx/gpc.c
+++ b/arch/arm/mach-imx/gpc.c
@@ -280,9 +280,15 @@ void __init imx_gpc_check_dt(void)
 	struct device_node *np;
 
 	np = of_find_compatible_node(NULL, NULL, "fsl,imx6q-gpc");
-	if (WARN_ON(!np ||
-		    !of_find_property(np, "interrupt-controller", NULL)))
-		pr_warn("Outdated DT detected, system is about to crash!!!\n");
+	if (WARN_ON(!np))
+		return;
+
+	if (WARN_ON(!of_find_property(np, "interrupt-controller", NULL))) {
+		pr_warn("Outdated DT detected, suspend/resume will NOT work\n");
+
+		/* map GPC, so that at least CPUidle and WARs keep working */
+		gpc_base = of_iomap(np, 0);
+	}
 }
 
 #ifdef CONFIG_PM_GENERIC_DOMAINS
@@ -443,6 +449,10 @@ static int imx_gpc_probe(struct platform_device *pdev)
 	struct regulator *pu_reg;
 	int ret;
 
+	/* bail out if DT too old and doesn't provide the necessary info */
+	if (!of_property_read_bool(pdev->dev.of_node, "#power-domain-cells"))
+		return 0;
+
 	pu_reg = devm_regulator_get_optional(&pdev->dev, "pu");
 	if (PTR_ERR(pu_reg) == -ENODEV)
 		pu_reg = NULL;
diff --git a/arch/arm/mach-keystone/pm_domain.c b/arch/arm/mach-keystone/pm_domain.c
index 41bebfd296dc..edea697e8253 100644
--- a/arch/arm/mach-keystone/pm_domain.c
+++ b/arch/arm/mach-keystone/pm_domain.c
@@ -19,40 +19,9 @@
 #include <linux/clk-provider.h>
 #include <linux/of.h>
 
-#ifdef CONFIG_PM
-static int keystone_pm_runtime_suspend(struct device *dev)
-{
-	int ret;
-
-	dev_dbg(dev, "%s\n", __func__);
-
-	ret = pm_generic_runtime_suspend(dev);
-	if (ret)
-		return ret;
-
-	ret = pm_clk_suspend(dev);
-	if (ret) {
-		pm_generic_runtime_resume(dev);
-		return ret;
-	}
-
-	return 0;
-}
-
-static int keystone_pm_runtime_resume(struct device *dev)
-{
-	dev_dbg(dev, "%s\n", __func__);
-
-	pm_clk_resume(dev);
-
-	return pm_generic_runtime_resume(dev);
-}
-#endif
-
 static struct dev_pm_domain keystone_pm_domain = {
 	.ops = {
-		SET_RUNTIME_PM_OPS(keystone_pm_runtime_suspend,
-				   keystone_pm_runtime_resume, NULL)
+		USE_PM_CLK_RUNTIME_OPS
 		USE_PLATFORM_PM_SLEEP_OPS
 	},
 };
diff --git a/arch/arm/mach-lpc32xx/clock.c b/arch/arm/mach-lpc32xx/clock.c
index dd5d6f532e8c..661c8f4b2310 100644
--- a/arch/arm/mach-lpc32xx/clock.c
+++ b/arch/arm/mach-lpc32xx/clock.c
@@ -1238,10 +1238,7 @@ static struct clk_lookup lookups[] = {
 
 static int __init clk_init(void)
 {
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(lookups); i++)
-		clkdev_add(&lookups[i]);
+	clkdev_add_table(lookups, ARRAY_SIZE(lookups));
 
 	/*
 	 * Setup muxed SYSCLK for HCLK PLL base -this selects the
diff --git a/arch/arm/mach-omap1/board-nokia770.c b/arch/arm/mach-omap1/board-nokia770.c
index 85089d821982..3bc59390a943 100644
--- a/arch/arm/mach-omap1/board-nokia770.c
+++ b/arch/arm/mach-omap1/board-nokia770.c
@@ -7,6 +7,7 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+#include <linux/clkdev.h>
 #include <linux/irq.h>
 #include <linux/gpio.h>
 #include <linux/kernel.h>
@@ -14,7 +15,6 @@
 #include <linux/mutex.h>
 #include <linux/platform_device.h>
 #include <linux/input.h>
-#include <linux/clk.h>
 #include <linux/omapfb.h>
 
 #include <linux/spi/spi.h>
diff --git a/arch/arm/mach-omap1/pm_bus.c b/arch/arm/mach-omap1/pm_bus.c
index c40e209de65c..667c1637ff91 100644
--- a/arch/arm/mach-omap1/pm_bus.c
+++ b/arch/arm/mach-omap1/pm_bus.c
@@ -21,48 +21,15 @@
 
 #include "soc.h"
 
-#ifdef CONFIG_PM
-static int omap1_pm_runtime_suspend(struct device *dev)
-{
-	int ret;
-
-	dev_dbg(dev, "%s\n", __func__);
-
-	ret = pm_generic_runtime_suspend(dev);
-	if (ret)
-		return ret;
-
-	ret = pm_clk_suspend(dev);
-	if (ret) {
-		pm_generic_runtime_resume(dev);
-		return ret;
-	}
-
-	return 0;
-}
-
-static int omap1_pm_runtime_resume(struct device *dev)
-{
-	dev_dbg(dev, "%s\n", __func__);
-
-	pm_clk_resume(dev);
-	return pm_generic_runtime_resume(dev);
-}
-
 static struct dev_pm_domain default_pm_domain = {
 	.ops = {
-		.runtime_suspend = omap1_pm_runtime_suspend,
-		.runtime_resume = omap1_pm_runtime_resume,
+		USE_PM_CLK_RUNTIME_OPS
 		USE_PLATFORM_PM_SLEEP_OPS
 	},
 };
-#define OMAP1_PM_DOMAIN (&default_pm_domain)
-#else
-#define OMAP1_PM_DOMAIN NULL
-#endif /* CONFIG_PM */
 
 static struct pm_clk_notifier_block platform_bus_notifier = {
-	.pm_domain = OMAP1_PM_DOMAIN,
+	.pm_domain = &default_pm_domain,
 	.con_ids = { "ick", "fck", NULL, },
 };
 
diff --git a/arch/arm/mach-omap2/clkt2xxx_virt_prcm_set.c b/arch/arm/mach-omap2/clkt2xxx_virt_prcm_set.c
index 85e0b0c06718..b64d717bfab6 100644
--- a/arch/arm/mach-omap2/clkt2xxx_virt_prcm_set.c
+++ b/arch/arm/mach-omap2/clkt2xxx_virt_prcm_set.c
@@ -232,14 +232,12 @@ void omap2xxx_clkt_vps_init(void)
 	struct clk_hw_omap *hw = NULL;
 	struct clk *clk;
 	const char *parent_name = "mpu_ck";
-	struct clk_lookup *lookup = NULL;
 
 	omap2xxx_clkt_vps_late_init();
 	omap2xxx_clkt_vps_check_bootloader_rates();
 
 	hw = kzalloc(sizeof(*hw), GFP_KERNEL);
-	lookup = kzalloc(sizeof(*lookup), GFP_KERNEL);
-	if (!hw || !lookup)
+	if (!hw)
 		goto cleanup;
 	init.name = "virt_prcm_set";
 	init.ops = &virt_prcm_set_ops;
@@ -249,15 +247,9 @@ void omap2xxx_clkt_vps_init(void)
 	hw->hw.init = &init;
 
 	clk = clk_register(NULL, &hw->hw);
-
-	lookup->dev_id = NULL;
-	lookup->con_id = "cpufreq_ck";
-	lookup->clk = clk;
-
-	clkdev_add(lookup);
+	clkdev_create(clk, "cpufreq_ck", NULL);
 	return;
 cleanup:
 	kfree(hw);
-	kfree(lookup);
 }
 #endif
diff --git a/arch/arm/mach-omap2/display.c b/arch/arm/mach-omap2/display.c
index f492ae147c6a..6ab13d18c636 100644
--- a/arch/arm/mach-omap2/display.c
+++ b/arch/arm/mach-omap2/display.c
@@ -287,6 +287,8 @@ static enum omapdss_version __init omap_display_get_version(void)
 		return OMAPDSS_VER_OMAP5;
 	else if (soc_is_am43xx())
 		return OMAPDSS_VER_AM43xx;
+	else if (soc_is_dra7xx())
+		return OMAPDSS_VER_DRA7xx;
 	else
 		return OMAPDSS_VER_UNKNOWN;
 }
@@ -568,25 +570,25 @@ void __init omapdss_early_init_of(void)
 
 }
 
+static const char * const omapdss_compat_names[] __initconst = {
+	"ti,omap2-dss",
+	"ti,omap3-dss",
+	"ti,omap4-dss",
+	"ti,omap5-dss",
+	"ti,dra7-dss",
+};
+
 struct device_node * __init omapdss_find_dss_of_node(void)
 {
 	struct device_node *node;
+	int i;
 
-	node = of_find_compatible_node(NULL, NULL, "ti,omap2-dss");
-	if (node)
-		return node;
-
-	node = of_find_compatible_node(NULL, NULL, "ti,omap3-dss");
-	if (node)
-		return node;
-
-	node = of_find_compatible_node(NULL, NULL, "ti,omap4-dss");
-	if (node)
-		return node;
-
-	node = of_find_compatible_node(NULL, NULL, "ti,omap5-dss");
-	if (node)
-		return node;
+	for (i = 0; i < ARRAY_SIZE(omapdss_compat_names); ++i) {
+		node = of_find_compatible_node(NULL, NULL,
+			omapdss_compat_names[i]);
+		if (node)
+			return node;
+	}
 
 	return NULL;
 }
diff --git a/arch/arm/mach-omap2/omap_device.c b/arch/arm/mach-omap2/omap_device.c
index 166b18f515a2..e417f7fcb2ba 100644
--- a/arch/arm/mach-omap2/omap_device.c
+++ b/arch/arm/mach-omap2/omap_device.c
@@ -47,7 +47,7 @@ static void _add_clkdev(struct omap_device *od, const char *clk_alias,
 		       const char *clk_name)
 {
 	struct clk *r;
-	struct clk_lookup *l;
+	int rc;
 
 	if (!clk_alias || !clk_name)
 		return;
@@ -62,21 +62,15 @@ static void _add_clkdev(struct omap_device *od, const char *clk_alias,
 		return;
 	}
 
-	r = clk_get(NULL, clk_name);
-	if (IS_ERR(r)) {
-		dev_err(&od->pdev->dev,
-			"clk_get for %s failed\n", clk_name);
-		return;
+	rc = clk_add_alias(clk_alias, dev_name(&od->pdev->dev), clk_name, NULL);
+	if (rc) {
+		if (rc == -ENODEV || rc == -ENOMEM)
+			dev_err(&od->pdev->dev,
+				"clkdev_alloc for %s failed\n", clk_alias);
+		else
+			dev_err(&od->pdev->dev,
+				"clk_get for %s failed\n", clk_name);
 	}
-
-	l = clkdev_alloc(r, clk_alias, dev_name(&od->pdev->dev));
-	if (!l) {
-		dev_err(&od->pdev->dev,
-			"clkdev_alloc for %s failed\n", clk_alias);
-		return;
-	}
-
-	clkdev_add(l);
 }
 
 /**
@@ -688,11 +682,8 @@ struct dev_pm_domain omap_device_pm_domain = {
 		SET_RUNTIME_PM_OPS(_od_runtime_suspend, _od_runtime_resume,
 				   NULL)
 		USE_PLATFORM_PM_SLEEP_OPS
-		.suspend_noirq = _od_suspend_noirq,
-		.resume_noirq = _od_resume_noirq,
-		.freeze_noirq = _od_suspend_noirq,
-		.thaw_noirq = _od_resume_noirq,
-		.restore_noirq = _od_resume_noirq,
+		SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(_od_suspend_noirq,
+					      _od_resume_noirq)
 	}
 };
 
diff --git a/arch/arm/mach-omap2/omap_hwmod_7xx_data.c b/arch/arm/mach-omap2/omap_hwmod_7xx_data.c
index 0e64c2fac0b5..9961f95f52ae 100644
--- a/arch/arm/mach-omap2/omap_hwmod_7xx_data.c
+++ b/arch/arm/mach-omap2/omap_hwmod_7xx_data.c
@@ -49,6 +49,27 @@
  */
 
 /*
+ * 'dmm' class
+ * instance(s): dmm
+ */
+static struct omap_hwmod_class dra7xx_dmm_hwmod_class = {
+	.name	= "dmm",
+};
+
+/* dmm */
+static struct omap_hwmod dra7xx_dmm_hwmod = {
+	.name		= "dmm",
+	.class		= &dra7xx_dmm_hwmod_class,
+	.clkdm_name	= "emif_clkdm",
+	.prcm = {
+		.omap4 = {
+			.clkctrl_offs = DRA7XX_CM_EMIF_DMM_CLKCTRL_OFFSET,
+			.context_offs = DRA7XX_RM_EMIF_DMM_CONTEXT_OFFSET,
+		},
+	},
+};
+
+/*
  * 'l3' class
  * instance(s): l3_instr, l3_main_1, l3_main_2
  */
@@ -438,6 +459,7 @@ static struct omap_hwmod_opt_clk dss_opt_clks[] = {
 	{ .role = "video2_clk", .clk = "dss_video2_clk" },
 	{ .role = "video1_clk", .clk = "dss_video1_clk" },
 	{ .role = "hdmi_clk", .clk = "dss_hdmi_clk" },
+	{ .role = "hdcp_clk", .clk = "dss_deshdcp_clk" },
 };
 
 static struct omap_hwmod dra7xx_dss_hwmod = {
@@ -500,6 +522,7 @@ static struct omap_hwmod dra7xx_dss_dispc_hwmod = {
 		},
 	},
 	.dev_attr	= &dss_dispc_dev_attr,
+	.parent_hwmod	= &dra7xx_dss_hwmod,
 };
 
 /*
@@ -541,6 +564,7 @@ static struct omap_hwmod dra7xx_dss_hdmi_hwmod = {
 	},
 	.opt_clks	= dss_hdmi_opt_clks,
 	.opt_clks_cnt	= ARRAY_SIZE(dss_hdmi_opt_clks),
+	.parent_hwmod	= &dra7xx_dss_hwmod,
 };
 
 /*
@@ -2321,6 +2345,14 @@ static struct omap_hwmod dra7xx_wd_timer2_hwmod = {
  * Interfaces
  */
 
+/* l3_main_1 -> dmm */
+static struct omap_hwmod_ocp_if dra7xx_l3_main_1__dmm = {
+	.master		= &dra7xx_l3_main_1_hwmod,
+	.slave		= &dra7xx_dmm_hwmod,
+	.clk		= "l3_iclk_div",
+	.user		= OCP_USER_SDMA,
+};
+
 /* l3_main_2 -> l3_instr */
 static struct omap_hwmod_ocp_if dra7xx_l3_main_2__l3_instr = {
 	.master		= &dra7xx_l3_main_2_hwmod,
@@ -3289,6 +3321,7 @@ static struct omap_hwmod_ocp_if dra7xx_l4_wkup__wd_timer2 = {
 };
 
 static struct omap_hwmod_ocp_if *dra7xx_hwmod_ocp_ifs[] __initdata = {
+	&dra7xx_l3_main_1__dmm,
 	&dra7xx_l3_main_2__l3_instr,
 	&dra7xx_l4_cfg__l3_main_1,
 	&dra7xx_mpu__l3_main_1,
diff --git a/arch/arm/mach-omap2/sleep34xx.S b/arch/arm/mach-omap2/sleep34xx.S
index d1dedc8195ed..eafd120b53f1 100644
--- a/arch/arm/mach-omap2/sleep34xx.S
+++ b/arch/arm/mach-omap2/sleep34xx.S
@@ -203,23 +203,8 @@ save_context_wfi:
 	 */
 	ldr	r1, kernel_flush
 	blx	r1
-	/*
-	 * The kernel doesn't interwork: v7_flush_dcache_all in particluar will
-	 * always return in Thumb state when CONFIG_THUMB2_KERNEL is enabled.
-	 * This sequence switches back to ARM.  Note that .align may insert a
-	 * nop: bx pc needs to be word-aligned in order to work.
-	 */
- THUMB(	.thumb		)
- THUMB(	.align		)
- THUMB(	bx	pc	)
- THUMB(	nop		)
-	.arm
-
 	b	omap3_do_wfi
-
-/*
- * Local variables
- */
+ENDPROC(omap34xx_cpu_suspend)
 omap3_do_wfi_sram_addr:
 	.word omap3_do_wfi_sram
 kernel_flush:
@@ -364,10 +349,7 @@ exit_nonoff_modes:
  * ===================================
  */
 	ldmfd	sp!, {r4 - r11, pc}	@ restore regs and return
-
-/*
- * Local variables
- */
+ENDPROC(omap3_do_wfi)
 sdrc_power:
 	.word	SDRC_POWER_V
 cm_idlest1_core:
diff --git a/arch/arm/mach-pxa/eseries.c b/arch/arm/mach-pxa/eseries.c
index cfb864173ce3..4427bf26ea47 100644
--- a/arch/arm/mach-pxa/eseries.c
+++ b/arch/arm/mach-pxa/eseries.c
@@ -10,6 +10,7 @@
  *
  */
 
+#include <linux/clkdev.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/gpio.h>
diff --git a/arch/arm/mach-pxa/lubbock.c b/arch/arm/mach-pxa/lubbock.c
index 4ac9ab80d24b..7518310c9015 100644
--- a/arch/arm/mach-pxa/lubbock.c
+++ b/arch/arm/mach-pxa/lubbock.c
@@ -11,6 +11,7 @@
  *  it under the terms of the GNU General Public License version 2 as
  *  published by the Free Software Foundation.
  */
+#include <linux/clkdev.h>
 #include <linux/gpio.h>
 #include <linux/gpio/machine.h>
 #include <linux/module.h>
diff --git a/arch/arm/mach-pxa/pxa_cplds_irqs.c b/arch/arm/mach-pxa/pxa_cplds_irqs.c
index f1aeb54fabe3..2385052b0ce1 100644
--- a/arch/arm/mach-pxa/pxa_cplds_irqs.c
+++ b/arch/arm/mach-pxa/pxa_cplds_irqs.c
@@ -107,7 +107,7 @@ static int cplds_probe(struct platform_device *pdev)
 	struct resource *res;
 	struct cplds *fpga;
 	int ret;
-	unsigned int base_irq = 0;
+	int base_irq;
 	unsigned long irqflags = 0;
 
 	fpga = devm_kzalloc(&pdev->dev, sizeof(*fpga), GFP_KERNEL);
diff --git a/arch/arm/mach-pxa/tosa.c b/arch/arm/mach-pxa/tosa.c
index 7780d1faa06f..92e56d8a24d8 100644
--- a/arch/arm/mach-pxa/tosa.c
+++ b/arch/arm/mach-pxa/tosa.c
@@ -12,6 +12,7 @@
  *
  */
 
+#include <linux/clkdev.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
diff --git a/arch/arm/mach-sa1100/neponset.c b/arch/arm/mach-sa1100/neponset.c
index af868d258e66..99d9a3b1bf34 100644
--- a/arch/arm/mach-sa1100/neponset.c
+++ b/arch/arm/mach-sa1100/neponset.c
@@ -327,8 +327,7 @@ static int neponset_probe(struct platform_device *dev)
 	irq_set_chip(d->irq_base + NEP_IRQ_SA1111, &nochip);
 
 	irq_set_irq_type(irq, IRQ_TYPE_EDGE_RISING);
-	irq_set_handler_data(irq, d);
-	irq_set_chained_handler(irq, neponset_irq_handler);
+	irq_set_chained_handler_and_data(irq, neponset_irq_handler, d);
 
 	/*
 	 * We would set IRQ_GPIO25 to be a wake-up IRQ, but unfortunately
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 6333d9c17875..0d629b8f973f 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -276,7 +276,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c
index b98895d9fe57..ee8dfa793989 100644
--- a/arch/arm/mm/highmem.c
+++ b/arch/arm/mm/highmem.c
@@ -59,6 +59,7 @@ void *kmap_atomic(struct page *page)
 	void *kmap;
 	int type;
 
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -121,6 +122,7 @@ void __kunmap_atomic(void *kvaddr)
 		kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
 	}
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
@@ -130,6 +132,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
 	int idx, type;
 	struct page *page = pfn_to_page(pfn);
 
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index be92fa0f2f35..8a63b4cdc0f2 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -268,6 +268,7 @@ void __init arm_memblock_init(const struct machine_desc *mdesc)
 	if (mdesc->reserve)
 		mdesc->reserve();
 
+	early_init_fdt_reserve_self();
 	early_init_fdt_scan_reserved_mem();
 
 	/* reserve memory for DMA contiguous allocations */
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 4e6ef896c619..7186382672b5 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1112,22 +1112,22 @@ void __init sanity_check_meminfo(void)
 			}
 
 			/*
-			 * Find the first non-section-aligned page, and point
+			 * Find the first non-pmd-aligned page, and point
 			 * memblock_limit at it. This relies on rounding the
-			 * limit down to be section-aligned, which happens at
-			 * the end of this function.
+			 * limit down to be pmd-aligned, which happens at the
+			 * end of this function.
 			 *
 			 * With this algorithm, the start or end of almost any
-			 * bank can be non-section-aligned. The only exception
-			 * is that the start of the bank 0 must be section-
+			 * bank can be non-pmd-aligned. The only exception is
+			 * that the start of the bank 0 must be section-
 			 * aligned, since otherwise memory would need to be
 			 * allocated when mapping the start of bank 0, which
 			 * occurs before any free memory is mapped.
 			 */
 			if (!memblock_limit) {
-				if (!IS_ALIGNED(block_start, SECTION_SIZE))
+				if (!IS_ALIGNED(block_start, PMD_SIZE))
 					memblock_limit = block_start;
-				else if (!IS_ALIGNED(block_end, SECTION_SIZE))
+				else if (!IS_ALIGNED(block_end, PMD_SIZE))
 					memblock_limit = arm_lowmem_limit;
 			}
 
@@ -1137,12 +1137,12 @@ void __init sanity_check_meminfo(void)
 	high_memory = __va(arm_lowmem_limit - 1) + 1;
 
 	/*
-	 * Round the memblock limit down to a section size.  This
+	 * Round the memblock limit down to a pmd size.  This
 	 * helps to ensure that we will allocate memory from the
-	 * last full section, which should be mapped.
+	 * last full pmd, which should be mapped.
 	 */
 	if (memblock_limit)
-		memblock_limit = round_down(memblock_limit, SECTION_SIZE);
+		memblock_limit = round_down(memblock_limit, PMD_SIZE);
 	if (!memblock_limit)
 		memblock_limit = arm_lowmem_limit;
 
diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index e0e23582c8b4..4550d247e308 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -873,6 +873,16 @@ b_epilogue:
 			off = offsetof(struct sk_buff, queue_mapping);
 			emit(ARM_LDRH_I(r_A, r_skb, off), ctx);
 			break;
+		case BPF_LDX | BPF_W | BPF_ABS:
+			/*
+			 * load a 32bit word from struct seccomp_data.
+			 * seccomp_check_filter() will already have checked
+			 * that k is 32bit aligned and lies within the
+			 * struct seccomp_data.
+			 */
+			ctx->seen |= SEEN_SKB;
+			emit(ARM_LDR_I(r_A, r_skb, k), ctx);
+			break;
 		default:
 			return -1;
 		}
diff --git a/arch/arm/plat-orion/common.c b/arch/arm/plat-orion/common.c
index f5b00f41c4f6..2235081a04ee 100644
--- a/arch/arm/plat-orion/common.c
+++ b/arch/arm/plat-orion/common.c
@@ -28,11 +28,7 @@
 void __init orion_clkdev_add(const char *con_id, const char *dev_id,
 			     struct clk *clk)
 {
-	struct clk_lookup *cl;
-
-	cl = clkdev_alloc(clk, con_id, dev_id);
-	if (cl)
-		clkdev_add(cl);
+	clkdev_create(clk, con_id, "%s", dev_id);
 }
 
 /* Create clkdev entries for all orion platforms except kirkwood.
diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c
index 224081ccc92f..7d0f07020c80 100644
--- a/arch/arm/xen/enlighten.c
+++ b/arch/arm/xen/enlighten.c
@@ -272,6 +272,7 @@ void xen_arch_pre_suspend(void) { }
 void xen_arch_post_suspend(int suspend_cancelled) { }
 void xen_timer_resume(void) { }
 void xen_arch_resume(void) { }
+void xen_arch_suspend(void) { }
 
 
 /* In the hypervisor.S file. */
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7796af4b1d6f..290ed648aa11 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1,5 +1,6 @@
 config ARM64
 	def_bool y
+	select ACPI_CCA_REQUIRED if ACPI
 	select ACPI_GENERIC_GSI if ACPI
 	select ACPI_REDUCED_HARDWARE_ONLY if ACPI
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
@@ -22,6 +23,7 @@ config ARM64
 	select BUILDTIME_EXTABLE_SORT
 	select CLONE_BACKWARDS
 	select COMMON_CLK
+	select EDAC_SUPPORT
 	select CPU_PM if (SUSPEND || CPU_IDLE)
 	select DCACHE_WORD_ACCESS
 	select GENERIC_ALLOCATOR
@@ -71,6 +73,7 @@ config ARM64
 	select HAVE_RCU_TABLE_FREE
 	select HAVE_SYSCALL_TRACEPOINTS
 	select IRQ_DOMAIN
+	select IRQ_FORCED_THREADING
 	select MODULES_USE_ELF_RELA
 	select NO_BOOTMEM
 	select OF
diff --git a/arch/arm64/boot/dts/apm/apm-storm.dtsi b/arch/arm64/boot/dts/apm/apm-storm.dtsi
index c8d3e0e86678..0bb287ca0a98 100644
--- a/arch/arm64/boot/dts/apm/apm-storm.dtsi
+++ b/arch/arm64/boot/dts/apm/apm-storm.dtsi
@@ -374,6 +374,111 @@
 			};
 		};
 
+		msi: msi@79000000 {
+			compatible = "apm,xgene1-msi";
+			msi-controller;
+			reg = <0x00 0x79000000 0x0 0x900000>;
+			interrupts = <  0x0 0x10 0x4
+					0x0 0x11 0x4
+					0x0 0x12 0x4
+					0x0 0x13 0x4
+					0x0 0x14 0x4
+					0x0 0x15 0x4
+					0x0 0x16 0x4
+					0x0 0x17 0x4
+					0x0 0x18 0x4
+					0x0 0x19 0x4
+					0x0 0x1a 0x4
+					0x0 0x1b 0x4
+					0x0 0x1c 0x4
+					0x0 0x1d 0x4
+					0x0 0x1e 0x4
+					0x0 0x1f 0x4>;
+		};
+
+		csw: csw@7e200000 {
+			compatible = "apm,xgene-csw", "syscon";
+			reg = <0x0 0x7e200000 0x0 0x1000>;
+		};
+
+		mcba: mcba@7e700000 {
+			compatible = "apm,xgene-mcb", "syscon";
+			reg = <0x0 0x7e700000 0x0 0x1000>;
+		};
+
+		mcbb: mcbb@7e720000 {
+			compatible = "apm,xgene-mcb", "syscon";
+			reg = <0x0 0x7e720000 0x0 0x1000>;
+		};
+
+		efuse: efuse@1054a000 {
+			compatible = "apm,xgene-efuse", "syscon";
+			reg = <0x0 0x1054a000 0x0 0x20>;
+		};
+
+		edac@78800000 {
+			compatible = "apm,xgene-edac";
+			#address-cells = <2>;
+			#size-cells = <2>;
+			ranges;
+			regmap-csw = <&csw>;
+			regmap-mcba = <&mcba>;
+			regmap-mcbb = <&mcbb>;
+			regmap-efuse = <&efuse>;
+			reg = <0x0 0x78800000 0x0 0x100>;
+			interrupts = <0x0 0x20 0x4>,
+				     <0x0 0x21 0x4>,
+				     <0x0 0x27 0x4>;
+
+			edacmc@7e800000 {
+				compatible = "apm,xgene-edac-mc";
+				reg = <0x0 0x7e800000 0x0 0x1000>;
+				memory-controller = <0>;
+			};
+
+			edacmc@7e840000 {
+				compatible = "apm,xgene-edac-mc";
+				reg = <0x0 0x7e840000 0x0 0x1000>;
+				memory-controller = <1>;
+			};
+
+			edacmc@7e880000 {
+				compatible = "apm,xgene-edac-mc";
+				reg = <0x0 0x7e880000 0x0 0x1000>;
+				memory-controller = <2>;
+			};
+
+			edacmc@7e8c0000 {
+				compatible = "apm,xgene-edac-mc";
+				reg = <0x0 0x7e8c0000 0x0 0x1000>;
+				memory-controller = <3>;
+			};
+
+			edacpmd@7c000000 {
+				compatible = "apm,xgene-edac-pmd";
+				reg = <0x0 0x7c000000 0x0 0x200000>;
+				pmd-controller = <0>;
+			};
+
+			edacpmd@7c200000 {
+				compatible = "apm,xgene-edac-pmd";
+				reg = <0x0 0x7c200000 0x0 0x200000>;
+				pmd-controller = <1>;
+			};
+
+			edacpmd@7c400000 {
+				compatible = "apm,xgene-edac-pmd";
+				reg = <0x0 0x7c400000 0x0 0x200000>;
+				pmd-controller = <2>;
+			};
+
+			edacpmd@7c600000 {
+				compatible = "apm,xgene-edac-pmd";
+				reg = <0x0 0x7c600000 0x0 0x200000>;
+				pmd-controller = <3>;
+			};
+		};
+
 		pcie0: pcie@1f2b0000 {
 			status = "disabled";
 			device_type = "pci";
@@ -395,6 +500,7 @@
 					 0x0 0x0 0x0 0x4 &gic 0x0 0xc5 0x1>;
 			dma-coherent;
 			clocks = <&pcie0clk 0>;
+			msi-parent = <&msi>;
 		};
 
 		pcie1: pcie@1f2c0000 {
@@ -418,6 +524,7 @@
 					 0x0 0x0 0x0 0x4 &gic 0x0 0xcb 0x1>;
 			dma-coherent;
 			clocks = <&pcie1clk 0>;
+			msi-parent = <&msi>;
 		};
 
 		pcie2: pcie@1f2d0000 {
@@ -441,6 +548,7 @@
 					 0x0 0x0 0x0 0x4 &gic 0x0 0xd1 0x1>;
 			dma-coherent;
 			clocks = <&pcie2clk 0>;
+			msi-parent = <&msi>;
 		};
 
 		pcie3: pcie@1f500000 {
@@ -464,6 +572,7 @@
 					 0x0 0x0 0x0 0x4 &gic 0x0 0xd7 0x1>;
 			dma-coherent;
 			clocks = <&pcie3clk 0>;
+			msi-parent = <&msi>;
 		};
 
 		pcie4: pcie@1f510000 {
@@ -487,6 +596,7 @@
 					 0x0 0x0 0x0 0x4 &gic 0x0 0xdd 0x1>;
 			dma-coherent;
 			clocks = <&pcie4clk 0>;
+			msi-parent = <&msi>;
 		};
 
 		serial0: serial@1c020000 {
diff --git a/arch/arm64/boot/dts/mediatek/mt8173-evb.dts b/arch/arm64/boot/dts/mediatek/mt8173-evb.dts
index 43d54017b779..d0ab012fa379 100644
--- a/arch/arm64/boot/dts/mediatek/mt8173-evb.dts
+++ b/arch/arm64/boot/dts/mediatek/mt8173-evb.dts
@@ -16,7 +16,8 @@
 #include "mt8173.dtsi"
 
 / {
-	model = "mediatek,mt8173-evb";
+	model = "MediaTek MT8173 evaluation board";
+	compatible = "mediatek,mt8173-evb", "mediatek,mt8173";
 
 	aliases {
 		serial0 = &uart0;
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 2ed7449d9273..daefbf0329a6 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -180,6 +180,7 @@ CONFIG_LOCKUP_DETECTOR=y
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_DEBUG_PREEMPT is not set
 # CONFIG_FTRACE is not set
+CONFIG_MEMTEST=y
 CONFIG_SECURITY=y
 CONFIG_CRYPTO_ANSI_CPRNG=y
 CONFIG_ARM64_CRYPTO=y
diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c
index 6c348df5bf36..3303e8a7b837 100644
--- a/arch/arm64/crypto/aes-ce-ccm-glue.c
+++ b/arch/arm64/crypto/aes-ce-ccm-glue.c
@@ -13,7 +13,7 @@
 #include <crypto/aes.h>
 #include <crypto/algapi.h>
 #include <crypto/scatterwalk.h>
-#include <linux/crypto.h>
+#include <crypto/internal/aead.h>
 #include <linux/module.h>
 
 #include "aes-ce-setkey.h"
diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index 59c05d8ea4a0..39248d3adf5d 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -16,6 +16,7 @@
 #include <linux/irqchip/arm-gic-acpi.h>
 
 #include <asm/cputype.h>
+#include <asm/psci.h>
 #include <asm/smp_plat.h>
 
 /* Basic configuration for ACPI */
@@ -39,18 +40,6 @@ extern int acpi_disabled;
 extern int acpi_noirq;
 extern int acpi_pci_disabled;
 
-/* 1 to indicate PSCI 0.2+ is implemented */
-static inline bool acpi_psci_present(void)
-{
-	return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_COMPLIANT;
-}
-
-/* 1 to indicate HVC must be used instead of SMC as the PSCI conduit */
-static inline bool acpi_psci_use_hvc(void)
-{
-	return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_USE_HVC;
-}
-
 static inline void disable_acpi(void)
 {
 	acpi_disabled = 1;
@@ -88,9 +77,11 @@ static inline void arch_fix_phys_package_id(int num, u32 slot) { }
 void __init acpi_init_cpus(void);
 
 #else
-static inline bool acpi_psci_present(void) { return false; }
-static inline bool acpi_psci_use_hvc(void) { return false; }
 static inline void acpi_init_cpus(void) { }
 #endif /* CONFIG_ACPI */
 
+static inline const char *acpi_get_enable_method(int cpu)
+{
+	return acpi_psci_present() ? "psci" : NULL;
+}
 #endif /*_ASM_ACPI_H*/
diff --git a/arch/arm64/include/asm/alternative-asm.h b/arch/arm64/include/asm/alternative-asm.h
deleted file mode 100644
index 919a67855b63..000000000000
--- a/arch/arm64/include/asm/alternative-asm.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef __ASM_ALTERNATIVE_ASM_H
-#define __ASM_ALTERNATIVE_ASM_H
-
-#ifdef __ASSEMBLY__
-
-.macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len
-	.word \orig_offset - .
-	.word \alt_offset - .
-	.hword \feature
-	.byte \orig_len
-	.byte \alt_len
-.endm
-
-.macro alternative_insn insn1 insn2 cap
-661:	\insn1
-662:	.pushsection .altinstructions, "a"
-	altinstruction_entry 661b, 663f, \cap, 662b-661b, 664f-663f
-	.popsection
-	.pushsection .altinstr_replacement, "ax"
-663:	\insn2
-664:	.popsection
-	.if ((664b-663b) != (662b-661b))
-		.error "Alternatives instruction length mismatch"
-	.endif
-.endm
-
-#endif  /*  __ASSEMBLY__  */
-
-#endif /* __ASM_ALTERNATIVE_ASM_H */
diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h
index d261f01e2bae..c385a0c4057f 100644
--- a/arch/arm64/include/asm/alternative.h
+++ b/arch/arm64/include/asm/alternative.h
@@ -1,6 +1,8 @@
 #ifndef __ASM_ALTERNATIVE_H
 #define __ASM_ALTERNATIVE_H
 
+#ifndef __ASSEMBLY__
+
 #include <linux/types.h>
 #include <linux/stddef.h>
 #include <linux/stringify.h>
@@ -24,7 +26,20 @@ void free_alternatives_memory(void);
 	" .byte 662b-661b\n"				/* source len      */ \
 	" .byte 664f-663f\n"				/* replacement len */
 
-/* alternative assembly primitive: */
+/*
+ * alternative assembly primitive:
+ *
+ * If any of these .org directive fail, it means that insn1 and insn2
+ * don't have the same length. This used to be written as
+ *
+ * .if ((664b-663b) != (662b-661b))
+ * 	.error "Alternatives instruction length mismatch"
+ * .endif
+ *
+ * but most assemblers die if insn1 or insn2 have a .inst. This should
+ * be fixed in a binutils release posterior to 2.25.51.0.2 (anything
+ * containing commit 4e4d08cf7399b606 or c1baaddf8861).
+ */
 #define ALTERNATIVE(oldinstr, newinstr, feature)			\
 	"661:\n\t"							\
 	oldinstr "\n"							\
@@ -37,8 +52,31 @@ void free_alternatives_memory(void);
 	newinstr "\n"							\
 	"664:\n\t"							\
 	".popsection\n\t"						\
-	".if ((664b-663b) != (662b-661b))\n\t"				\
-	"	.error \"Alternatives instruction length mismatch\"\n\t"\
-	".endif\n"
+	".org	. - (664b-663b) + (662b-661b)\n\t"			\
+	".org	. - (662b-661b) + (664b-663b)\n"
+
+#else
+
+.macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len
+	.word \orig_offset - .
+	.word \alt_offset - .
+	.hword \feature
+	.byte \orig_len
+	.byte \alt_len
+.endm
+
+.macro alternative_insn insn1 insn2 cap
+661:	\insn1
+662:	.pushsection .altinstructions, "a"
+	altinstruction_entry 661b, 663f, \cap, 662b-661b, 664f-663f
+	.popsection
+	.pushsection .altinstr_replacement, "ax"
+663:	\insn2
+664:	.popsection
+	.org	. - (664b-663b) + (662b-661b)
+	.org	. - (662b-661b) + (664b-663b)
+.endm
+
+#endif  /*  __ASSEMBLY__  */
 
 #endif /* __ASM_ALTERNATIVE_H */
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 71f19c4dc0de..0fa47c4275cb 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -114,7 +114,7 @@ do {									\
 #define read_barrier_depends()		do { } while(0)
 #define smp_read_barrier_depends()	do { } while(0)
 
-#define set_mb(var, value)	do { var = value; smp_mb(); } while (0)
+#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 #define nop()		asm volatile("nop");
 
 #define smp_mb__before_atomic()	smp_mb()
diff --git a/arch/arm64/include/asm/boot.h b/arch/arm64/include/asm/boot.h
new file mode 100644
index 000000000000..81151b67b26b
--- /dev/null
+++ b/arch/arm64/include/asm/boot.h
@@ -0,0 +1,14 @@
+
+#ifndef __ASM_BOOT_H
+#define __ASM_BOOT_H
+
+#include <asm/sizes.h>
+
+/*
+ * arm64 requires the DTB to be 8 byte aligned and
+ * not exceed 2MB in size.
+ */
+#define MIN_FDT_ALIGN		8
+#define MAX_FDT_SIZE		SZ_2M
+
+#endif
diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h
index 67d309cc3b6b..c75b8d027eb1 100644
--- a/arch/arm64/include/asm/cacheflush.h
+++ b/arch/arm64/include/asm/cacheflush.h
@@ -40,10 +40,6 @@
  *	the implementation assumes non-aliasing VIPT D-cache and (aliasing)
  *	VIPT or ASID-tagged VIVT I-cache.
  *
- *	flush_cache_all()
- *
- *		Unconditionally clean and invalidate the entire cache.
- *
  *	flush_cache_mm(mm)
  *
  *		Clean and invalidate all user space cache entries
@@ -69,7 +65,6 @@
  *		- kaddr  - page address
  *		- size   - region size
  */
-extern void flush_cache_all(void);
 extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end);
 extern void flush_icache_range(unsigned long start, unsigned long end);
 extern void __flush_dcache_area(void *addr, size_t len);
diff --git a/arch/arm64/include/asm/cpu_ops.h b/arch/arm64/include/asm/cpu_ops.h
index 5a31d6716914..8f03446cf89f 100644
--- a/arch/arm64/include/asm/cpu_ops.h
+++ b/arch/arm64/include/asm/cpu_ops.h
@@ -19,15 +19,15 @@
 #include <linux/init.h>
 #include <linux/threads.h>
 
-struct device_node;
-
 /**
  * struct cpu_operations - Callback operations for hotplugging CPUs.
  *
  * @name:	Name of the property as appears in a devicetree cpu node's
- *		enable-method property.
- * @cpu_init:	Reads any data necessary for a specific enable-method from the
- *		devicetree, for a given cpu node and proposed logical id.
+ *		enable-method property. On systems booting with ACPI, @name
+ *		identifies the struct cpu_operations entry corresponding to
+ *		the boot protocol specified in the ACPI MADT table.
+ * @cpu_init:	Reads any data necessary for a specific enable-method for a
+ *		proposed logical id.
  * @cpu_prepare: Early one-time preparation step for a cpu. If there is a
  *		mechanism for doing so, tests whether it is possible to boot
  *		the given CPU.
@@ -40,15 +40,15 @@ struct device_node;
  * @cpu_die:	Makes a cpu leave the kernel. Must not fail. Called from the
  *		cpu being killed.
  * @cpu_kill:  Ensures a cpu has left the kernel. Called from another cpu.
- * @cpu_init_idle: Reads any data necessary to initialize CPU idle states from
- *		devicetree, for a given cpu node and proposed logical id.
+ * @cpu_init_idle: Reads any data necessary to initialize CPU idle states for
+ *		   a proposed logical id.
  * @cpu_suspend: Suspends a cpu and saves the required context. May fail owing
  *               to wrong parameters or error conditions. Called from the
  *               CPU being suspended. Must be called with IRQs disabled.
  */
 struct cpu_operations {
 	const char	*name;
-	int		(*cpu_init)(struct device_node *, unsigned int);
+	int		(*cpu_init)(unsigned int);
 	int		(*cpu_prepare)(unsigned int);
 	int		(*cpu_boot)(unsigned int);
 	void		(*cpu_postboot)(void);
@@ -58,14 +58,17 @@ struct cpu_operations {
 	int		(*cpu_kill)(unsigned int cpu);
 #endif
 #ifdef CONFIG_CPU_IDLE
-	int		(*cpu_init_idle)(struct device_node *, unsigned int);
+	int		(*cpu_init_idle)(unsigned int);
 	int		(*cpu_suspend)(unsigned long);
 #endif
 };
 
 extern const struct cpu_operations *cpu_ops[NR_CPUS];
-int __init cpu_read_ops(struct device_node *dn, int cpu);
-void __init cpu_read_bootcpu_ops(void);
-const struct cpu_operations *cpu_get_ops(const char *name);
+int __init cpu_read_ops(int cpu);
+
+static inline void __init cpu_read_bootcpu_ops(void)
+{
+	cpu_read_ops(0);
+}
 
 #endif /* ifndef __ASM_CPU_OPS_H */
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 82cb9f98ba1a..c1044218a63a 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -24,8 +24,9 @@
 #define ARM64_WORKAROUND_CLEAN_CACHE		0
 #define ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE	1
 #define ARM64_WORKAROUND_845719			2
+#define ARM64_HAS_SYSREG_GIC_CPUIF		3
 
-#define ARM64_NCAPS				3
+#define ARM64_NCAPS				4
 
 #ifndef __ASSEMBLY__
 
@@ -38,6 +39,11 @@ struct arm64_cpu_capabilities {
 			u32 midr_model;
 			u32 midr_range_min, midr_range_max;
 		};
+
+		struct {	/* Feature register checking */
+			u64 register_mask;
+			u64 register_value;
+		};
 	};
 };
 
diff --git a/arch/arm64/include/asm/cpuidle.h b/arch/arm64/include/asm/cpuidle.h
index 141b2fcabaa6..0f74f05d662a 100644
--- a/arch/arm64/include/asm/cpuidle.h
+++ b/arch/arm64/include/asm/cpuidle.h
@@ -5,20 +5,16 @@
 
 #ifdef CONFIG_CPU_IDLE
 extern int arm_cpuidle_init(unsigned int cpu);
-extern int cpu_suspend(unsigned long arg);
+extern int arm_cpuidle_suspend(int index);
 #else
 static inline int arm_cpuidle_init(unsigned int cpu)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline int cpu_suspend(unsigned long arg)
+static inline int arm_cpuidle_suspend(int index)
 {
 	return -EOPNOTSUPP;
 }
 #endif
-static inline int arm_cpuidle_suspend(int index)
-{
-	return cpu_suspend(index);
-}
 #endif
diff --git a/arch/arm64/include/asm/dma-mapping.h b/arch/arm64/include/asm/dma-mapping.h
index 9437e3dc5833..f0d6d0bfe55c 100644
--- a/arch/arm64/include/asm/dma-mapping.h
+++ b/arch/arm64/include/asm/dma-mapping.h
@@ -18,6 +18,7 @@
 
 #ifdef __KERNEL__
 
+#include <linux/acpi.h>
 #include <linux/types.h>
 #include <linux/vmalloc.h>
 
@@ -28,13 +29,23 @@
 
 #define DMA_ERROR_CODE	(~(dma_addr_t)0)
 extern struct dma_map_ops *dma_ops;
+extern struct dma_map_ops dummy_dma_ops;
 
 static inline struct dma_map_ops *__generic_dma_ops(struct device *dev)
 {
-	if (unlikely(!dev) || !dev->archdata.dma_ops)
+	if (unlikely(!dev))
 		return dma_ops;
-	else
+	else if (dev->archdata.dma_ops)
 		return dev->archdata.dma_ops;
+	else if (acpi_disabled)
+		return dma_ops;
+
+	/*
+	 * When ACPI is enabled, if arch_set_dma_ops is not called,
+	 * we will disable device DMA capability by setting it
+	 * to dummy_dma_ops.
+	 */
+	return &dummy_dma_ops;
 }
 
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
@@ -48,6 +59,9 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
 static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 				      struct iommu_ops *iommu, bool coherent)
 {
+	if (!acpi_disabled && !dev->archdata.dma_ops)
+		dev->archdata.dma_ops = dma_ops;
+
 	dev->archdata.dma_coherent = coherent;
 }
 #define arch_setup_dma_ops	arch_setup_dma_ops
diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h
index 95e6b6dcbe37..c0739187a920 100644
--- a/arch/arm64/include/asm/fixmap.h
+++ b/arch/arm64/include/asm/fixmap.h
@@ -17,6 +17,7 @@
 
 #ifndef __ASSEMBLY__
 #include <linux/kernel.h>
+#include <asm/boot.h>
 #include <asm/page.h>
 
 /*
@@ -32,6 +33,20 @@
  */
 enum fixed_addresses {
 	FIX_HOLE,
+
+	/*
+	 * Reserve a virtual window for the FDT that is 2 MB larger than the
+	 * maximum supported size, and put it at the top of the fixmap region.
+	 * The additional space ensures that any FDT that does not exceed
+	 * MAX_FDT_SIZE can be mapped regardless of whether it crosses any
+	 * 2 MB alignment boundaries.
+	 *
+	 * Keep this at the top so it remains 2 MB aligned.
+	 */
+#define FIX_FDT_SIZE		(MAX_FDT_SIZE + SZ_2M)
+	FIX_FDT_END,
+	FIX_FDT = FIX_FDT_END + FIX_FDT_SIZE / PAGE_SIZE - 1,
+
 	FIX_EARLYCON_MEM_BASE,
 	FIX_TEXT_POKE0,
 	__end_of_permanent_fixed_addresses,
diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h
index 5f750dc96e0f..74069b3bd919 100644
--- a/arch/arm64/include/asm/futex.h
+++ b/arch/arm64/include/asm/futex.h
@@ -58,7 +58,7 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
 		return -EFAULT;
 
-	pagefault_disable();	/* implies preempt_disable() */
+	pagefault_disable();
 
 	switch (op) {
 	case FUTEX_OP_SET:
@@ -85,7 +85,7 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 		ret = -ENOSYS;
 	}
 
-	pagefault_enable();	/* subsumes preempt_enable() */
+	pagefault_enable();
 
 	if (!ret) {
 		switch (cmp) {
diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index f81b328d9cf4..30e50eb54a67 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -281,6 +281,7 @@ __AARCH64_INSN_FUNCS(ret,	0xFFFFFC1F, 0xD65F0000)
 #undef	__AARCH64_INSN_FUNCS
 
 bool aarch64_insn_is_nop(u32 insn);
+bool aarch64_insn_is_branch_imm(u32 insn);
 
 int aarch64_insn_read(void *addr, u32 *insnp);
 int aarch64_insn_write(void *addr, u32 insn);
@@ -351,6 +352,8 @@ u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst,
 					 int shift,
 					 enum aarch64_insn_variant variant,
 					 enum aarch64_insn_logic_type type);
+s32 aarch64_get_branch_offset(u32 insn);
+u32 aarch64_set_branch_offset(u32 insn, s32 offset);
 
 bool aarch64_insn_hotpatch_safe(u32 old_insn, u32 new_insn);
 
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h
index 540f7c0aea82..44be1e03ed65 100644
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -117,10 +117,10 @@ static inline u64 __raw_readq(const volatile void __iomem *addr)
  * ordering rules but do not guarantee any ordering relative to Normal memory
  * accesses.
  */
-#define readb_relaxed(c)	({ u8  __v = __raw_readb(c); __v; })
-#define readw_relaxed(c)	({ u16 __v = le16_to_cpu((__force __le16)__raw_readw(c)); __v; })
-#define readl_relaxed(c)	({ u32 __v = le32_to_cpu((__force __le32)__raw_readl(c)); __v; })
-#define readq_relaxed(c)	({ u64 __v = le64_to_cpu((__force __le64)__raw_readq(c)); __v; })
+#define readb_relaxed(c)	({ u8  __r = __raw_readb(c); __r; })
+#define readw_relaxed(c)	({ u16 __r = le16_to_cpu((__force __le16)__raw_readw(c)); __r; })
+#define readl_relaxed(c)	({ u32 __r = le32_to_cpu((__force __le32)__raw_readl(c)); __r; })
+#define readq_relaxed(c)	({ u64 __r = le64_to_cpu((__force __le64)__raw_readq(c)); __r; })
 
 #define writeb_relaxed(v,c)	((void)__raw_writeb((v),(c)))
 #define writew_relaxed(v,c)	((void)__raw_writew((__force u16)cpu_to_le16(v),(c)))
@@ -170,6 +170,7 @@ extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size);
 #define ioremap(addr, size)		__ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE))
 #define ioremap_nocache(addr, size)	__ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE))
 #define ioremap_wc(addr, size)		__ioremap((addr), (size), __pgprot(PROT_NORMAL_NC))
+#define ioremap_wt(addr, size)		__ioremap((addr), (size), __pgprot(PROT_DEVICE_nGnRE))
 #define iounmap				__iounmap
 
 /*
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 4f7310fa77f0..3c5fe685a2d6 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -27,7 +27,7 @@
 #define	MPIDR_EL1	1	/* MultiProcessor Affinity Register */
 #define	CSSELR_EL1	2	/* Cache Size Selection Register */
 #define	SCTLR_EL1	3	/* System Control Register */
-#define	ACTLR_EL1	4	/* Auxilliary Control Register */
+#define	ACTLR_EL1	4	/* Auxiliary Control Register */
 #define	CPACR_EL1	5	/* Coprocessor Access Control */
 #define	TTBR0_EL1	6	/* Translation Table Base Register 0 */
 #define	TTBR1_EL1	7	/* Translation Table Base Register 1 */
@@ -132,11 +132,6 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
 extern u64 __vgic_v3_get_ich_vtr_el2(void);
 
-extern char __save_vgic_v2_state[];
-extern char __restore_vgic_v2_state[];
-extern char __save_vgic_v3_state[];
-extern char __restore_vgic_v3_state[];
-
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index f0f58c9beec0..2709db2a7eac 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -221,29 +221,6 @@ struct vgic_sr_vectors {
 	void	*restore_vgic;
 };
 
-static inline void vgic_arch_setup(const struct vgic_params *vgic)
-{
-	extern struct vgic_sr_vectors __vgic_sr_vectors;
-
-	switch(vgic->type)
-	{
-	case VGIC_V2:
-		__vgic_sr_vectors.save_vgic	= __save_vgic_v2_state;
-		__vgic_sr_vectors.restore_vgic	= __restore_vgic_v2_state;
-		break;
-
-#ifdef CONFIG_ARM_GIC_V3
-	case VGIC_V3:
-		__vgic_sr_vectors.save_vgic	= __save_vgic_v3_state;
-		__vgic_sr_vectors.restore_vgic	= __restore_vgic_v3_state;
-		break;
-#endif
-
-	default:
-		BUG();
-	}
-}
-
 static inline void kvm_arch_hardware_disable(void) {}
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 3d311761e3c2..79fcfb048884 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -34,5 +34,6 @@ extern void init_mem_pgprot(void);
 extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 			       unsigned long virt, phys_addr_t size,
 			       pgprot_t prot);
+extern void *fixmap_remap_fdt(phys_addr_t dt_phys);
 
 #endif
diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index d26d1d53c0d7..6471773db6fd 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -24,4 +24,11 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
 #define perf_misc_flags(regs)	perf_misc_flags(regs)
 #endif
 
+#define perf_arch_fetch_caller_regs(regs, __ip) { \
+	(regs)->pc = (__ip);    \
+	(regs)->regs[29] = (unsigned long) __builtin_frame_address(0); \
+	(regs)->sp = current_stack_pointer; \
+	(regs)->pstate = PSR_MODE_EL1h;	\
+}
+
 #endif
diff --git a/arch/arm64/include/asm/proc-fns.h b/arch/arm64/include/asm/proc-fns.h
index 220633b791b8..14ad6e4e87d1 100644
--- a/arch/arm64/include/asm/proc-fns.h
+++ b/arch/arm64/include/asm/proc-fns.h
@@ -28,12 +28,8 @@
 struct mm_struct;
 struct cpu_suspend_ctx;
 
-extern void cpu_cache_off(void);
 extern void cpu_do_idle(void);
 extern void cpu_do_switch_mm(unsigned long pgd_phys, struct mm_struct *mm);
-extern void cpu_reset(unsigned long addr) __attribute__((noreturn));
-void cpu_soft_restart(phys_addr_t cpu_reset,
-		unsigned long addr) __attribute__((noreturn));
 extern void cpu_do_suspend(struct cpu_suspend_ctx *ptr);
 extern u64 cpu_do_resume(phys_addr_t ptr, u64 idmap_ttbr);
 
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index d2c37a1df0eb..e4c893e54f01 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -78,13 +78,30 @@ struct cpu_context {
 
 struct thread_struct {
 	struct cpu_context	cpu_context;	/* cpu context */
-	unsigned long		tp_value;
+	unsigned long		tp_value;	/* TLS register */
+#ifdef CONFIG_COMPAT
+	unsigned long		tp2_value;
+#endif
 	struct fpsimd_state	fpsimd_state;
 	unsigned long		fault_address;	/* fault info */
 	unsigned long		fault_code;	/* ESR_EL1 value */
 	struct debug_info	debug;		/* debugging */
 };
 
+#ifdef CONFIG_COMPAT
+#define task_user_tls(t)						\
+({									\
+	unsigned long *__tls;						\
+	if (is_compat_thread(task_thread_info(t)))			\
+		__tls = &(t)->thread.tp2_value;				\
+	else								\
+		__tls = &(t)->thread.tp_value;				\
+	__tls;								\
+ })
+#else
+#define task_user_tls(t)	(&(t)->thread.tp_value)
+#endif
+
 #define INIT_THREAD  {	}
 
 static inline void start_thread_common(struct pt_regs *regs, unsigned long pc)
diff --git a/arch/arm64/include/asm/psci.h b/arch/arm64/include/asm/psci.h
index 2454bc59c916..49d7e1aaebdc 100644
--- a/arch/arm64/include/asm/psci.h
+++ b/arch/arm64/include/asm/psci.h
@@ -14,7 +14,15 @@
 #ifndef __ASM_PSCI_H
 #define __ASM_PSCI_H
 
-int psci_dt_init(void);
-int psci_acpi_init(void);
+int __init psci_dt_init(void);
+
+#ifdef CONFIG_ACPI
+int __init psci_acpi_init(void);
+bool __init acpi_psci_present(void);
+bool __init acpi_psci_use_hvc(void);
+#else
+static inline int psci_acpi_init(void) { return 0; }
+static inline bool acpi_psci_present(void) { return false; }
+#endif
 
 #endif /* __ASM_PSCI_H */
diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h
index bf22650b1a78..db02be81b90a 100644
--- a/arch/arm64/include/asm/smp.h
+++ b/arch/arm64/include/asm/smp.h
@@ -42,7 +42,7 @@ extern void handle_IPI(int ipinr, struct pt_regs *regs);
  * Discover the set of possible CPUs and determine their
  * SMP operations.
  */
-extern void of_smp_init_cpus(void);
+extern void smp_init_cpus(void);
 
 /*
  * Provide a function to raise an IPI cross call on CPUs in callmap.
diff --git a/arch/arm64/include/asm/smp_plat.h b/arch/arm64/include/asm/smp_plat.h
index 8dcd61e32176..7abf7570c00f 100644
--- a/arch/arm64/include/asm/smp_plat.h
+++ b/arch/arm64/include/asm/smp_plat.h
@@ -19,6 +19,8 @@
 #ifndef __ASM_SMP_PLAT_H
 #define __ASM_SMP_PLAT_H
 
+#include <linux/cpumask.h>
+
 #include <asm/types.h>
 
 struct mpidr_hash {
@@ -39,6 +41,20 @@ static inline u32 mpidr_hash_size(void)
  */
 extern u64 __cpu_logical_map[NR_CPUS];
 #define cpu_logical_map(cpu)    __cpu_logical_map[cpu]
+/*
+ * Retrieve logical cpu index corresponding to a given MPIDR.Aff*
+ *  - mpidr: MPIDR.Aff* bits to be used for the look-up
+ *
+ * Returns the cpu logical index or -EINVAL on look-up error
+ */
+static inline int get_logical_index(u64 mpidr)
+{
+	int cpu;
+	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+		if (cpu_logical_map(cpu) == mpidr)
+			return cpu;
+	return -EINVAL;
+}
 
 void __init do_post_cpus_up_work(void);
 
diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h
index 003802f58963..59a5b0f1e81c 100644
--- a/arch/arm64/include/asm/suspend.h
+++ b/arch/arm64/include/asm/suspend.h
@@ -21,6 +21,6 @@ struct sleep_save_sp {
 	phys_addr_t save_ptr_stash_phys;
 };
 
-extern int __cpu_suspend(unsigned long arg, int (*fn)(unsigned long));
+extern int cpu_suspend(unsigned long arg, int (*fn)(unsigned long));
 extern void cpu_resume(void);
 #endif
diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h
index 7a18fabbe0f6..57f110bea6a8 100644
--- a/arch/arm64/include/asm/system_misc.h
+++ b/arch/arm64/include/asm/system_misc.h
@@ -23,6 +23,8 @@
 #include <linux/compiler.h>
 #include <linux/linkage.h>
 #include <linux/irqflags.h>
+#include <linux/signal.h>
+#include <linux/ratelimit.h>
 #include <linux/reboot.h>
 
 struct pt_regs;
@@ -41,9 +43,19 @@ struct mm_struct;
 extern void show_pte(struct mm_struct *mm, unsigned long addr);
 extern void __show_regs(struct pt_regs *);
 
-void soft_restart(unsigned long);
 extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd);
 
+#define show_unhandled_signals_ratelimited()				\
+({									\
+	static DEFINE_RATELIMIT_STATE(_rs,				\
+				      DEFAULT_RATELIMIT_INTERVAL,	\
+				      DEFAULT_RATELIMIT_BURST);		\
+	bool __show_ratelimited = false;				\
+	if (show_unhandled_signals && __ratelimit(&_rs))		\
+		__show_ratelimited = true;				\
+	__show_ratelimited;						\
+})
+
 #define UDBG_UNDEFINED	(1 << 0)
 #define UDBG_SYSCALL	(1 << 1)
 #define UDBG_BADABORT	(1 << 2)
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index c3bb05b98616..934815d45eda 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -28,8 +28,6 @@
  *	TLB Management
  *	==============
  *
- *	The arch/arm64/mm/tlb.S files implement these methods.
- *
  *	The TLB specific code is expected to perform whatever tests it needs
  *	to determine if it should invalidate the TLB for each call.  Start
  *	addresses are inclusive and end addresses are exclusive; it is safe to
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 7ebcd31ce51c..225ec3524fbf 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -18,7 +18,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
 #define topology_physical_package_id(cpu)	(cpu_topology[cpu].cluster_id)
 #define topology_core_id(cpu)		(cpu_topology[cpu].core_id)
 #define topology_core_cpumask(cpu)	(&cpu_topology[cpu].core_sibling)
-#define topology_thread_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
+#define topology_sibling_cpumask(cpu)	(&cpu_topology[cpu].thread_sibling)
 
 void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index 8b839558838e..19de7537e7d3 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -36,12 +36,6 @@ EXPORT_SYMBOL(acpi_disabled);
 int acpi_pci_disabled = 1;	/* skip ACPI PCI scan and IRQ initialization */
 EXPORT_SYMBOL(acpi_pci_disabled);
 
-/* Processors with enabled flag and sane MPIDR */
-static int enabled_cpus;
-
-/* Boot CPU is valid or not in MADT */
-static bool bootcpu_valid  __initdata;
-
 static bool param_acpi_off __initdata;
 static bool param_acpi_force __initdata;
 
@@ -95,122 +89,15 @@ void __init __acpi_unmap_table(char *map, unsigned long size)
 	early_memunmap(map, size);
 }
 
-/**
- * acpi_map_gic_cpu_interface - generates a logical cpu number
- * and map to MPIDR represented by GICC structure
- */
-static void __init
-acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor)
+bool __init acpi_psci_present(void)
 {
-	int i;
-	u64 mpidr = processor->arm_mpidr & MPIDR_HWID_BITMASK;
-	bool enabled = !!(processor->flags & ACPI_MADT_ENABLED);
-
-	if (mpidr == INVALID_HWID) {
-		pr_info("Skip MADT cpu entry with invalid MPIDR\n");
-		return;
-	}
-
-	total_cpus++;
-	if (!enabled)
-		return;
-
-	if (enabled_cpus >=  NR_CPUS) {
-		pr_warn("NR_CPUS limit of %d reached, Processor %d/0x%llx ignored.\n",
-			NR_CPUS, total_cpus, mpidr);
-		return;
-	}
-
-	/* Check if GICC structure of boot CPU is available in the MADT */
-	if (cpu_logical_map(0) == mpidr) {
-		if (bootcpu_valid) {
-			pr_err("Firmware bug, duplicate CPU MPIDR: 0x%llx in MADT\n",
-			       mpidr);
-			return;
-		}
-
-		bootcpu_valid = true;
-	}
-
-	/*
-	 * Duplicate MPIDRs are a recipe for disaster. Scan
-	 * all initialized entries and check for
-	 * duplicates. If any is found just ignore the CPU.
-	 */
-	for (i = 1; i < enabled_cpus; i++) {
-		if (cpu_logical_map(i) == mpidr) {
-			pr_err("Firmware bug, duplicate CPU MPIDR: 0x%llx in MADT\n",
-			       mpidr);
-			return;
-		}
-	}
-
-	if (!acpi_psci_present())
-		return;
-
-	cpu_ops[enabled_cpus] = cpu_get_ops("psci");
-	/* CPU 0 was already initialized */
-	if (enabled_cpus) {
-		if (!cpu_ops[enabled_cpus])
-			return;
-
-		if (cpu_ops[enabled_cpus]->cpu_init(NULL, enabled_cpus))
-			return;
-
-		/* map the logical cpu id to cpu MPIDR */
-		cpu_logical_map(enabled_cpus) = mpidr;
-	}
-
-	enabled_cpus++;
+	return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_COMPLIANT;
 }
 
-static int __init
-acpi_parse_gic_cpu_interface(struct acpi_subtable_header *header,
-				const unsigned long end)
+/* Whether HVC must be used instead of SMC as the PSCI conduit */
+bool __init acpi_psci_use_hvc(void)
 {
-	struct acpi_madt_generic_interrupt *processor;
-
-	processor = (struct acpi_madt_generic_interrupt *)header;
-
-	if (BAD_MADT_ENTRY(processor, end))
-		return -EINVAL;
-
-	acpi_table_print_madt_entry(header);
-	acpi_map_gic_cpu_interface(processor);
-	return 0;
-}
-
-/* Parse GIC cpu interface entries in MADT for SMP init */
-void __init acpi_init_cpus(void)
-{
-	int count, i;
-
-	/*
-	 * do a partial walk of MADT to determine how many CPUs
-	 * we have including disabled CPUs, and get information
-	 * we need for SMP init
-	 */
-	count = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
-			acpi_parse_gic_cpu_interface, 0);
-
-	if (!count) {
-		pr_err("No GIC CPU interface entries present\n");
-		return;
-	} else if (count < 0) {
-		pr_err("Error parsing GIC CPU interface entry\n");
-		return;
-	}
-
-	if (!bootcpu_valid) {
-		pr_err("MADT missing boot CPU MPIDR, not enabling secondaries\n");
-		return;
-	}
-
-	for (i = 0; i < enabled_cpus; i++)
-		set_cpu_possible(i, true);
-
-	/* Make boot-up look pretty */
-	pr_info("%d CPUs enabled, %d CPUs total\n", enabled_cpus, total_cpus);
+	return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_USE_HVC;
 }
 
 /*
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c
index 28f8365edc4c..221b98312f0c 100644
--- a/arch/arm64/kernel/alternative.c
+++ b/arch/arm64/kernel/alternative.c
@@ -24,8 +24,13 @@
 #include <asm/cacheflush.h>
 #include <asm/alternative.h>
 #include <asm/cpufeature.h>
+#include <asm/insn.h>
 #include <linux/stop_machine.h>
 
+#define __ALT_PTR(a,f)		(u32 *)((void *)&(a)->f + (a)->f)
+#define ALT_ORIG_PTR(a)		__ALT_PTR(a, orig_offset)
+#define ALT_REPL_PTR(a)		__ALT_PTR(a, alt_offset)
+
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 
 struct alt_region {
@@ -33,13 +38,63 @@ struct alt_region {
 	struct alt_instr *end;
 };
 
+/*
+ * Check if the target PC is within an alternative block.
+ */
+static bool branch_insn_requires_update(struct alt_instr *alt, unsigned long pc)
+{
+	unsigned long replptr;
+
+	if (kernel_text_address(pc))
+		return 1;
+
+	replptr = (unsigned long)ALT_REPL_PTR(alt);
+	if (pc >= replptr && pc <= (replptr + alt->alt_len))
+		return 0;
+
+	/*
+	 * Branching into *another* alternate sequence is doomed, and
+	 * we're not even trying to fix it up.
+	 */
+	BUG();
+}
+
+static u32 get_alt_insn(struct alt_instr *alt, u32 *insnptr, u32 *altinsnptr)
+{
+	u32 insn;
+
+	insn = le32_to_cpu(*altinsnptr);
+
+	if (aarch64_insn_is_branch_imm(insn)) {
+		s32 offset = aarch64_get_branch_offset(insn);
+		unsigned long target;
+
+		target = (unsigned long)altinsnptr + offset;
+
+		/*
+		 * If we're branching inside the alternate sequence,
+		 * do not rewrite the instruction, as it is already
+		 * correct. Otherwise, generate the new instruction.
+		 */
+		if (branch_insn_requires_update(alt, target)) {
+			offset = target - (unsigned long)insnptr;
+			insn = aarch64_set_branch_offset(insn, offset);
+		}
+	}
+
+	return insn;
+}
+
 static int __apply_alternatives(void *alt_region)
 {
 	struct alt_instr *alt;
 	struct alt_region *region = alt_region;
-	u8 *origptr, *replptr;
+	u32 *origptr, *replptr;
 
 	for (alt = region->begin; alt < region->end; alt++) {
+		u32 insn;
+		int i, nr_inst;
+
 		if (!cpus_have_cap(alt->cpufeature))
 			continue;
 
@@ -47,11 +102,17 @@ static int __apply_alternatives(void *alt_region)
 
 		pr_info_once("patching kernel code\n");
 
-		origptr = (u8 *)&alt->orig_offset + alt->orig_offset;
-		replptr = (u8 *)&alt->alt_offset + alt->alt_offset;
-		memcpy(origptr, replptr, alt->alt_len);
+		origptr = ALT_ORIG_PTR(alt);
+		replptr = ALT_REPL_PTR(alt);
+		nr_inst = alt->alt_len / sizeof(insn);
+
+		for (i = 0; i < nr_inst; i++) {
+			insn = get_alt_insn(alt, origptr + i, replptr + i);
+			*(origptr + i) = cpu_to_le32(insn);
+		}
+
 		flush_icache_range((uintptr_t)origptr,
-				   (uintptr_t)(origptr + alt->alt_len));
+				   (uintptr_t)(origptr + nr_inst));
 	}
 
 	return 0;
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index da675cc5dfae..c99701a34d7b 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -127,7 +127,6 @@ int main(void)
   DEFINE(VCPU_VGIC_CPU,		offsetof(struct kvm_vcpu, arch.vgic_cpu));
   DEFINE(VGIC_SAVE_FN,		offsetof(struct vgic_sr_vectors, save_vgic));
   DEFINE(VGIC_RESTORE_FN,	offsetof(struct vgic_sr_vectors, restore_vgic));
-  DEFINE(VGIC_SR_VECTOR_SZ,	sizeof(struct vgic_sr_vectors));
   DEFINE(VGIC_V2_CPU_HCR,	offsetof(struct vgic_cpu, vgic_v2.vgic_hcr));
   DEFINE(VGIC_V2_CPU_VMCR,	offsetof(struct vgic_cpu, vgic_v2.vgic_vmcr));
   DEFINE(VGIC_V2_CPU_MISR,	offsetof(struct vgic_cpu, vgic_v2.vgic_misr));
diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c
index fb8ff9ba467a..5ea337dd2f15 100644
--- a/arch/arm64/kernel/cpu_ops.c
+++ b/arch/arm64/kernel/cpu_ops.c
@@ -16,11 +16,13 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include <asm/cpu_ops.h>
-#include <asm/smp_plat.h>
+#include <linux/acpi.h>
 #include <linux/errno.h>
 #include <linux/of.h>
 #include <linux/string.h>
+#include <asm/acpi.h>
+#include <asm/cpu_ops.h>
+#include <asm/smp_plat.h>
 
 extern const struct cpu_operations smp_spin_table_ops;
 extern const struct cpu_operations cpu_psci_ops;
@@ -35,7 +37,7 @@ static const struct cpu_operations *supported_cpu_ops[] __initconst = {
 	NULL,
 };
 
-const struct cpu_operations * __init cpu_get_ops(const char *name)
+static const struct cpu_operations * __init cpu_get_ops(const char *name)
 {
 	const struct cpu_operations **ops = supported_cpu_ops;
 
@@ -49,39 +51,53 @@ const struct cpu_operations * __init cpu_get_ops(const char *name)
 	return NULL;
 }
 
+static const char *__init cpu_read_enable_method(int cpu)
+{
+	const char *enable_method;
+
+	if (acpi_disabled) {
+		struct device_node *dn = of_get_cpu_node(cpu, NULL);
+
+		if (!dn) {
+			if (!cpu)
+				pr_err("Failed to find device node for boot cpu\n");
+			return NULL;
+		}
+
+		enable_method = of_get_property(dn, "enable-method", NULL);
+		if (!enable_method) {
+			/*
+			 * The boot CPU may not have an enable method (e.g.
+			 * when spin-table is used for secondaries).
+			 * Don't warn spuriously.
+			 */
+			if (cpu != 0)
+				pr_err("%s: missing enable-method property\n",
+					dn->full_name);
+		}
+	} else {
+		enable_method = acpi_get_enable_method(cpu);
+		if (!enable_method)
+			pr_err("Unsupported ACPI enable-method\n");
+	}
+
+	return enable_method;
+}
 /*
- * Read a cpu's enable method from the device tree and record it in cpu_ops.
+ * Read a cpu's enable method and record it in cpu_ops.
  */
-int __init cpu_read_ops(struct device_node *dn, int cpu)
+int __init cpu_read_ops(int cpu)
 {
-	const char *enable_method = of_get_property(dn, "enable-method", NULL);
-	if (!enable_method) {
-		/*
-		 * The boot CPU may not have an enable method (e.g. when
-		 * spin-table is used for secondaries). Don't warn spuriously.
-		 */
-		if (cpu != 0)
-			pr_err("%s: missing enable-method property\n",
-				dn->full_name);
-		return -ENOENT;
-	}
+	const char *enable_method = cpu_read_enable_method(cpu);
+
+	if (!enable_method)
+		return -ENODEV;
 
 	cpu_ops[cpu] = cpu_get_ops(enable_method);
 	if (!cpu_ops[cpu]) {
-		pr_warn("%s: unsupported enable-method property: %s\n",
-			dn->full_name, enable_method);
+		pr_warn("Unsupported enable-method: %s\n", enable_method);
 		return -EOPNOTSUPP;
 	}
 
 	return 0;
 }
-
-void __init cpu_read_bootcpu_ops(void)
-{
-	struct device_node *dn = of_get_cpu_node(0, NULL);
-	if (!dn) {
-		pr_err("Failed to find device node for boot cpu\n");
-		return;
-	}
-	cpu_read_ops(dn, 0);
-}
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 3d9967e43d89..5ad86ceac010 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -22,7 +22,23 @@
 #include <asm/cpu.h>
 #include <asm/cpufeature.h>
 
+static bool
+has_id_aa64pfr0_feature(const struct arm64_cpu_capabilities *entry)
+{
+	u64 val;
+
+	val = read_cpuid(id_aa64pfr0_el1);
+	return (val & entry->register_mask) == entry->register_value;
+}
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
+	{
+		.desc = "GIC system register CPU interface",
+		.capability = ARM64_HAS_SYSREG_GIC_CPUIF,
+		.matches = has_id_aa64pfr0_feature,
+		.register_mask = (0xf << 24),
+		.register_value = (1 << 24),
+	},
 	{},
 };
 
diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c
index a78143a5c99f..7ce589ca54a4 100644
--- a/arch/arm64/kernel/cpuidle.c
+++ b/arch/arm64/kernel/cpuidle.c
@@ -18,15 +18,10 @@
 int arm_cpuidle_init(unsigned int cpu)
 {
 	int ret = -EOPNOTSUPP;
-	struct device_node *cpu_node = of_cpu_device_node_get(cpu);
-
-	if (!cpu_node)
-		return -ENODEV;
 
 	if (cpu_ops[cpu] && cpu_ops[cpu]->cpu_init_idle)
-		ret = cpu_ops[cpu]->cpu_init_idle(cpu_node, cpu);
+		ret = cpu_ops[cpu]->cpu_init_idle(cpu);
 
-	of_node_put(cpu_node);
 	return ret;
 }
 
@@ -37,7 +32,7 @@ int arm_cpuidle_init(unsigned int cpu)
  * Return: 0 on success, -EOPNOTSUPP if CPU suspend hook not initialized, CPU
  * operations back-end error code otherwise.
  */
-int cpu_suspend(unsigned long arg)
+int arm_cpuidle_suspend(int index)
 {
 	int cpu = smp_processor_id();
 
@@ -47,5 +42,5 @@ int cpu_suspend(unsigned long arg)
 	 */
 	if (!cpu_ops[cpu] || !cpu_ops[cpu]->cpu_suspend)
 		return -EOPNOTSUPP;
-	return cpu_ops[cpu]->cpu_suspend(arg);
+	return cpu_ops[cpu]->cpu_suspend(index);
 }
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 959fe8733560..a7691a378668 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -21,7 +21,7 @@
 #include <linux/init.h>
 #include <linux/linkage.h>
 
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
 #include <asm/assembler.h>
 #include <asm/asm-offsets.h>
 #include <asm/cpufeature.h>
@@ -124,21 +124,24 @@
 	msr	sp_el0, x23
 
 #ifdef CONFIG_ARM64_ERRATUM_845719
-	alternative_insn						\
-	"nop",								\
-	"tbz x22, #4, 1f",						\
-	ARM64_WORKAROUND_845719
+
+#undef SEQUENCE_ORG
+#undef SEQUENCE_ALT
+
 #ifdef CONFIG_PID_IN_CONTEXTIDR
-	alternative_insn						\
-	"nop; nop",							\
-	"mrs x29, contextidr_el1; msr contextidr_el1, x29; 1:",		\
-	ARM64_WORKAROUND_845719
+
+#define SEQUENCE_ORG	"nop ; nop ; nop"
+#define SEQUENCE_ALT	"tbz x22, #4, 1f ; mrs x29, contextidr_el1; msr contextidr_el1, x29; 1:"
+
 #else
-	alternative_insn						\
-	"nop",								\
-	"msr contextidr_el1, xzr; 1:",					\
-	ARM64_WORKAROUND_845719
+
+#define SEQUENCE_ORG	"nop ; nop"
+#define SEQUENCE_ALT	"tbz x22, #4, 1f ; msr contextidr_el1, xzr; 1:"
+
 #endif
+
+	alternative_insn SEQUENCE_ORG, SEQUENCE_ALT, ARM64_WORKAROUND_845719
+
 #endif
 	.endif
 	msr	elr_el1, x21			// set up the return data
@@ -517,6 +520,7 @@ el0_sp_pc:
 	mrs	x26, far_el1
 	// enable interrupts before calling the main handler
 	enable_dbg_and_irq
+	ct_user_exit
 	mov	x0, x26
 	mov	x1, x25
 	mov	x2, sp
@@ -608,11 +612,16 @@ ENDPROC(cpu_switch_to)
  */
 ret_fast_syscall:
 	disable_irq				// disable interrupts
-	ldr	x1, [tsk, #TI_FLAGS]
+	ldr	x1, [tsk, #TI_FLAGS]		// re-check for syscall tracing
+	and	x2, x1, #_TIF_SYSCALL_WORK
+	cbnz	x2, ret_fast_syscall_trace
 	and	x2, x1, #_TIF_WORK_MASK
 	cbnz	x2, fast_work_pending
 	enable_step_tsk x1, x2
 	kernel_exit 0, ret = 1
+ret_fast_syscall_trace:
+	enable_irq				// enable interrupts
+	b	__sys_trace_return
 
 /*
  * Ok, we need to do extra processing, enter the slow path.
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 3dca15634e69..44d6f7545505 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -17,6 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/cpu.h>
 #include <linux/cpu_pm.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -296,6 +297,35 @@ static void fpsimd_pm_init(void)
 static inline void fpsimd_pm_init(void) { }
 #endif /* CONFIG_CPU_PM */
 
+#ifdef CONFIG_HOTPLUG_CPU
+static int fpsimd_cpu_hotplug_notifier(struct notifier_block *nfb,
+				       unsigned long action,
+				       void *hcpu)
+{
+	unsigned int cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		per_cpu(fpsimd_last_state, cpu) = NULL;
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block fpsimd_cpu_hotplug_notifier_block = {
+	.notifier_call = fpsimd_cpu_hotplug_notifier,
+};
+
+static inline void fpsimd_hotplug_init(void)
+{
+	register_cpu_notifier(&fpsimd_cpu_hotplug_notifier_block);
+}
+
+#else
+static inline void fpsimd_hotplug_init(void) { }
+#endif
+
 /*
  * FP/SIMD support code initialisation.
  */
@@ -315,6 +345,7 @@ static int __init fpsimd_init(void)
 		elf_hwcap |= HWCAP_ASIMD;
 
 	fpsimd_pm_init();
+	fpsimd_hotplug_init();
 
 	return 0;
 }
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 19f915e8f6e0..c0ff3ce4299e 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -237,8 +237,6 @@ ENTRY(stext)
 	bl	el2_setup			// Drop to EL1, w20=cpu_boot_mode
 	adrp	x24, __PHYS_OFFSET
 	bl	set_cpu_boot_mode_flag
-
-	bl	__vet_fdt
 	bl	__create_page_tables		// x25=TTBR0, x26=TTBR1
 	/*
 	 * The following calls CPU setup code, see arch/arm64/mm/proc.S for
@@ -270,24 +268,6 @@ preserve_boot_args:
 ENDPROC(preserve_boot_args)
 
 /*
- * Determine validity of the x21 FDT pointer.
- * The dtb must be 8-byte aligned and live in the first 512M of memory.
- */
-__vet_fdt:
-	tst	x21, #0x7
-	b.ne	1f
-	cmp	x21, x24
-	b.lt	1f
-	mov	x0, #(1 << 29)
-	add	x0, x0, x24
-	cmp	x21, x0
-	b.ge	1f
-	ret
-1:
-	mov	x21, #0
-	ret
-ENDPROC(__vet_fdt)
-/*
  * Macro to create a table entry to the next page.
  *
  *	tbl:	page table address
@@ -348,8 +328,7 @@ ENDPROC(__vet_fdt)
  * required to get the kernel running. The following sections are required:
  *   - identity mapping to enable the MMU (low address, TTBR0)
  *   - first few MB of the kernel linear mapping to jump to once the MMU has
- *     been enabled, including the FDT blob (TTBR1)
- *   - pgd entry for fixed mappings (TTBR1)
+ *     been enabled
  */
 __create_page_tables:
 	adrp	x25, idmap_pg_dir
@@ -382,7 +361,7 @@ __create_page_tables:
 	 * Create the identity mapping.
 	 */
 	mov	x0, x25				// idmap_pg_dir
-	adrp	x3, KERNEL_START		// __pa(KERNEL_START)
+	adrp	x3, __idmap_text_start		// __pa(__idmap_text_start)
 
 #ifndef CONFIG_ARM64_VA_BITS_48
 #define EXTRA_SHIFT	(PGDIR_SHIFT + PAGE_SHIFT - 3)
@@ -405,11 +384,11 @@ __create_page_tables:
 
 	/*
 	 * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the
-	 * entire kernel image can be ID mapped. As T0SZ == (64 - #bits used),
+	 * entire ID map region can be mapped. As T0SZ == (64 - #bits used),
 	 * this number conveniently equals the number of leading zeroes in
-	 * the physical address of KERNEL_END.
+	 * the physical address of __idmap_text_end.
 	 */
-	adrp	x5, KERNEL_END
+	adrp	x5, __idmap_text_end
 	clz	x5, x5
 	cmp	x5, TCR_T0SZ(VA_BITS)	// default T0SZ small enough?
 	b.ge	1f			// .. then skip additional level
@@ -424,8 +403,8 @@ __create_page_tables:
 #endif
 
 	create_pgd_entry x0, x3, x5, x6
-	mov	x5, x3				// __pa(KERNEL_START)
-	adr_l	x6, KERNEL_END			// __pa(KERNEL_END)
+	mov	x5, x3				// __pa(__idmap_text_start)
+	adr_l	x6, __idmap_text_end		// __pa(__idmap_text_end)
 	create_block_map x0, x7, x3, x5, x6
 
 	/*
@@ -439,22 +418,6 @@ __create_page_tables:
 	create_block_map x0, x7, x3, x5, x6
 
 	/*
-	 * Map the FDT blob (maximum 2MB; must be within 512MB of
-	 * PHYS_OFFSET).
-	 */
-	mov	x3, x21				// FDT phys address
-	and	x3, x3, #~((1 << 21) - 1)	// 2MB aligned
-	mov	x6, #PAGE_OFFSET
-	sub	x5, x3, x24			// subtract PHYS_OFFSET
-	tst	x5, #~((1 << 29) - 1)		// within 512MB?
-	csel	x21, xzr, x21, ne		// zero the FDT pointer
-	b.ne	1f
-	add	x5, x5, x6			// __va(FDT blob)
-	add	x6, x5, #1 << 21		// 2MB for the FDT blob
-	sub	x6, x6, #1			// inclusive range
-	create_block_map x0, x7, x3, x5, x6
-1:
-	/*
 	 * Since the page tables have been populated with non-cacheable
 	 * accesses (MMU disabled), invalidate the idmap and swapper page
 	 * tables again to remove any speculatively loaded cache lines.
@@ -669,6 +632,7 @@ ENDPROC(__secondary_switched)
  *
  * other registers depend on the function called upon completion
  */
+	.section	".idmap.text", "ax"
 __enable_mmu:
 	ldr	x5, =vectors
 	msr	vbar_el1, x5
diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c
index 924902083e47..dd9671cd0bb2 100644
--- a/arch/arm64/kernel/insn.c
+++ b/arch/arm64/kernel/insn.c
@@ -77,6 +77,14 @@ bool __kprobes aarch64_insn_is_nop(u32 insn)
 	}
 }
 
+bool aarch64_insn_is_branch_imm(u32 insn)
+{
+	return (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn) ||
+		aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn) ||
+		aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) ||
+		aarch64_insn_is_bcond(insn));
+}
+
 static DEFINE_SPINLOCK(patch_lock);
 
 static void __kprobes *patch_map(void *addr, int fixmap)
@@ -1057,6 +1065,58 @@ u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst,
 	return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift);
 }
 
+/*
+ * Decode the imm field of a branch, and return the byte offset as a
+ * signed value (so it can be used when computing a new branch
+ * target).
+ */
+s32 aarch64_get_branch_offset(u32 insn)
+{
+	s32 imm;
+
+	if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) {
+		imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_26, insn);
+		return (imm << 6) >> 4;
+	}
+
+	if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) ||
+	    aarch64_insn_is_bcond(insn)) {
+		imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_19, insn);
+		return (imm << 13) >> 11;
+	}
+
+	if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) {
+		imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_14, insn);
+		return (imm << 18) >> 16;
+	}
+
+	/* Unhandled instruction */
+	BUG();
+}
+
+/*
+ * Encode the displacement of a branch in the imm field and return the
+ * updated instruction.
+ */
+u32 aarch64_set_branch_offset(u32 insn, s32 offset)
+{
+	if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn))
+		return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn,
+						     offset >> 2);
+
+	if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) ||
+	    aarch64_insn_is_bcond(insn))
+		return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn,
+						     offset >> 2);
+
+	if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn))
+		return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_14, insn,
+						     offset >> 2);
+
+	/* Unhandled instruction */
+	BUG();
+}
+
 bool aarch32_insn_is_wide(u32 insn)
 {
 	return insn >= 0xe800;
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index cce18c85d2e8..702591f6180a 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -488,7 +488,7 @@ armpmu_reserve_hardware(struct arm_pmu *armpmu)
 			}
 
 			err = request_irq(irq, armpmu->handle_irq,
-					IRQF_NOBALANCING,
+					IRQF_NOBALANCING | IRQF_NO_THREAD,
 					"arm-pmu", armpmu);
 			if (err) {
 				pr_err("unable to request IRQ%d for ARM PMU counters\n",
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index c6b1f3b96f45..223b093c9440 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -58,14 +58,6 @@ unsigned long __stack_chk_guard __read_mostly;
 EXPORT_SYMBOL(__stack_chk_guard);
 #endif
 
-void soft_restart(unsigned long addr)
-{
-	setup_mm_for_reboot();
-	cpu_soft_restart(virt_to_phys(cpu_reset), addr);
-	/* Should never get here */
-	BUG();
-}
-
 /*
  * Function pointers to optional machine specific functions
  */
@@ -136,9 +128,7 @@ void machine_power_off(void)
 
 /*
  * Restart requires that the secondary CPUs stop performing any activity
- * while the primary CPU resets the system. Systems with a single CPU can
- * use soft_restart() as their machine descriptor's .restart hook, since that
- * will cause the only available CPU to reset. Systems with multiple CPUs must
+ * while the primary CPU resets the system. Systems with multiple CPUs must
  * provide a HW restart implementation, to ensure that all CPUs reset at once.
  * This is required so that any code running after reset on the primary CPU
  * doesn't have to co-ordinate with other CPUs to ensure they aren't still
@@ -243,7 +233,8 @@ void release_thread(struct task_struct *dead_task)
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-	fpsimd_preserve_current_state();
+	if (current->mm)
+		fpsimd_preserve_current_state();
 	*dst = *src;
 	return 0;
 }
@@ -254,35 +245,35 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
 		unsigned long stk_sz, struct task_struct *p)
 {
 	struct pt_regs *childregs = task_pt_regs(p);
-	unsigned long tls = p->thread.tp_value;
 
 	memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context));
 
 	if (likely(!(p->flags & PF_KTHREAD))) {
 		*childregs = *current_pt_regs();
 		childregs->regs[0] = 0;
-		if (is_compat_thread(task_thread_info(p))) {
-			if (stack_start)
+
+		/*
+		 * Read the current TLS pointer from tpidr_el0 as it may be
+		 * out-of-sync with the saved value.
+		 */
+		asm("mrs %0, tpidr_el0" : "=r" (*task_user_tls(p)));
+
+		if (stack_start) {
+			if (is_compat_thread(task_thread_info(p)))
 				childregs->compat_sp = stack_start;
-		} else {
-			/*
-			 * Read the current TLS pointer from tpidr_el0 as it may be
-			 * out-of-sync with the saved value.
-			 */
-			asm("mrs %0, tpidr_el0" : "=r" (tls));
-			if (stack_start) {
-				/* 16-byte aligned stack mandatory on AArch64 */
-				if (stack_start & 15)
-					return -EINVAL;
+			/* 16-byte aligned stack mandatory on AArch64 */
+			else if (stack_start & 15)
+				return -EINVAL;
+			else
 				childregs->sp = stack_start;
-			}
 		}
+
 		/*
 		 * If a TLS pointer was passed to clone (4th argument), use it
 		 * for the new thread.
 		 */
 		if (clone_flags & CLONE_SETTLS)
-			tls = childregs->regs[3];
+			p->thread.tp_value = childregs->regs[3];
 	} else {
 		memset(childregs, 0, sizeof(struct pt_regs));
 		childregs->pstate = PSR_MODE_EL1h;
@@ -291,7 +282,6 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
 	}
 	p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
 	p->thread.cpu_context.sp = (unsigned long)childregs;
-	p->thread.tp_value = tls;
 
 	ptrace_hw_copy_thread(p);
 
@@ -302,18 +292,12 @@ static void tls_thread_switch(struct task_struct *next)
 {
 	unsigned long tpidr, tpidrro;
 
-	if (!is_compat_task()) {
-		asm("mrs %0, tpidr_el0" : "=r" (tpidr));
-		current->thread.tp_value = tpidr;
-	}
+	asm("mrs %0, tpidr_el0" : "=r" (tpidr));
+	*task_user_tls(current) = tpidr;
 
-	if (is_compat_thread(task_thread_info(next))) {
-		tpidr = 0;
-		tpidrro = next->thread.tp_value;
-	} else {
-		tpidr = next->thread.tp_value;
-		tpidrro = 0;
-	}
+	tpidr = *task_user_tls(next);
+	tpidrro = is_compat_thread(task_thread_info(next)) ?
+		  next->thread.tp_value : 0;
 
 	asm(
 	"	msr	tpidr_el0, %0\n"
diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c
index ea18cb53921e..869f202748e8 100644
--- a/arch/arm64/kernel/psci.c
+++ b/arch/arm64/kernel/psci.c
@@ -15,7 +15,6 @@
 
 #define pr_fmt(fmt) "psci: " fmt
 
-#include <linux/acpi.h>
 #include <linux/init.h>
 #include <linux/of.h>
 #include <linux/smp.h>
@@ -25,8 +24,8 @@
 #include <linux/slab.h>
 #include <uapi/linux/psci.h>
 
-#include <asm/acpi.h>
 #include <asm/compiler.h>
+#include <asm/cputype.h>
 #include <asm/cpu_ops.h>
 #include <asm/errno.h>
 #include <asm/psci.h>
@@ -37,16 +36,31 @@
 #define PSCI_POWER_STATE_TYPE_STANDBY		0
 #define PSCI_POWER_STATE_TYPE_POWER_DOWN	1
 
-struct psci_power_state {
-	u16	id;
-	u8	type;
-	u8	affinity_level;
-};
+static bool psci_power_state_loses_context(u32 state)
+{
+	return state & PSCI_0_2_POWER_STATE_TYPE_MASK;
+}
+
+static bool psci_power_state_is_valid(u32 state)
+{
+	const u32 valid_mask = PSCI_0_2_POWER_STATE_ID_MASK |
+			       PSCI_0_2_POWER_STATE_TYPE_MASK |
+			       PSCI_0_2_POWER_STATE_AFFL_MASK;
+
+	return !(state & ~valid_mask);
+}
+
+/*
+ * The CPU any Trusted OS is resident on. The trusted OS may reject CPU_OFF
+ * calls to its resident CPU, so we must avoid issuing those. We never migrate
+ * a Trusted OS even if it claims to be capable of migration -- doing so will
+ * require cooperation with a Trusted OS driver.
+ */
+static int resident_cpu = -1;
 
 struct psci_operations {
-	int (*cpu_suspend)(struct psci_power_state state,
-			   unsigned long entry_point);
-	int (*cpu_off)(struct psci_power_state state);
+	int (*cpu_suspend)(u32 state, unsigned long entry_point);
+	int (*cpu_off)(u32 state);
 	int (*cpu_on)(unsigned long cpuid, unsigned long entry_point);
 	int (*migrate)(unsigned long cpuid);
 	int (*affinity_info)(unsigned long target_affinity,
@@ -56,23 +70,21 @@ struct psci_operations {
 
 static struct psci_operations psci_ops;
 
-static int (*invoke_psci_fn)(u64, u64, u64, u64);
-typedef int (*psci_initcall_t)(const struct device_node *);
-
-asmlinkage int __invoke_psci_fn_hvc(u64, u64, u64, u64);
-asmlinkage int __invoke_psci_fn_smc(u64, u64, u64, u64);
+typedef unsigned long (psci_fn)(unsigned long, unsigned long,
+				unsigned long, unsigned long);
+asmlinkage psci_fn __invoke_psci_fn_hvc;
+asmlinkage psci_fn __invoke_psci_fn_smc;
+static psci_fn *invoke_psci_fn;
 
 enum psci_function {
 	PSCI_FN_CPU_SUSPEND,
 	PSCI_FN_CPU_ON,
 	PSCI_FN_CPU_OFF,
 	PSCI_FN_MIGRATE,
-	PSCI_FN_AFFINITY_INFO,
-	PSCI_FN_MIGRATE_INFO_TYPE,
 	PSCI_FN_MAX,
 };
 
-static DEFINE_PER_CPU_READ_MOSTLY(struct psci_power_state *, psci_power_state);
+static DEFINE_PER_CPU_READ_MOSTLY(u32 *, psci_power_state);
 
 static u32 psci_function_id[PSCI_FN_MAX];
 
@@ -92,56 +104,28 @@ static int psci_to_linux_errno(int errno)
 	return -EINVAL;
 }
 
-static u32 psci_power_state_pack(struct psci_power_state state)
-{
-	return ((state.id << PSCI_0_2_POWER_STATE_ID_SHIFT)
-			& PSCI_0_2_POWER_STATE_ID_MASK) |
-		((state.type << PSCI_0_2_POWER_STATE_TYPE_SHIFT)
-		 & PSCI_0_2_POWER_STATE_TYPE_MASK) |
-		((state.affinity_level << PSCI_0_2_POWER_STATE_AFFL_SHIFT)
-		 & PSCI_0_2_POWER_STATE_AFFL_MASK);
-}
-
-static void psci_power_state_unpack(u32 power_state,
-				    struct psci_power_state *state)
-{
-	state->id = (power_state & PSCI_0_2_POWER_STATE_ID_MASK) >>
-			PSCI_0_2_POWER_STATE_ID_SHIFT;
-	state->type = (power_state & PSCI_0_2_POWER_STATE_TYPE_MASK) >>
-			PSCI_0_2_POWER_STATE_TYPE_SHIFT;
-	state->affinity_level =
-			(power_state & PSCI_0_2_POWER_STATE_AFFL_MASK) >>
-			PSCI_0_2_POWER_STATE_AFFL_SHIFT;
-}
-
-static int psci_get_version(void)
+static u32 psci_get_version(void)
 {
-	int err;
-
-	err = invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0);
-	return err;
+	return invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0);
 }
 
-static int psci_cpu_suspend(struct psci_power_state state,
-			    unsigned long entry_point)
+static int psci_cpu_suspend(u32 state, unsigned long entry_point)
 {
 	int err;
-	u32 fn, power_state;
+	u32 fn;
 
 	fn = psci_function_id[PSCI_FN_CPU_SUSPEND];
-	power_state = psci_power_state_pack(state);
-	err = invoke_psci_fn(fn, power_state, entry_point, 0);
+	err = invoke_psci_fn(fn, state, entry_point, 0);
 	return psci_to_linux_errno(err);
 }
 
-static int psci_cpu_off(struct psci_power_state state)
+static int psci_cpu_off(u32 state)
 {
 	int err;
-	u32 fn, power_state;
+	u32 fn;
 
 	fn = psci_function_id[PSCI_FN_CPU_OFF];
-	power_state = psci_power_state_pack(state);
-	err = invoke_psci_fn(fn, power_state, 0, 0);
+	err = invoke_psci_fn(fn, state, 0, 0);
 	return psci_to_linux_errno(err);
 }
 
@@ -168,30 +152,29 @@ static int psci_migrate(unsigned long cpuid)
 static int psci_affinity_info(unsigned long target_affinity,
 		unsigned long lowest_affinity_level)
 {
-	int err;
-	u32 fn;
-
-	fn = psci_function_id[PSCI_FN_AFFINITY_INFO];
-	err = invoke_psci_fn(fn, target_affinity, lowest_affinity_level, 0);
-	return err;
+	return invoke_psci_fn(PSCI_0_2_FN64_AFFINITY_INFO, target_affinity,
+			      lowest_affinity_level, 0);
 }
 
 static int psci_migrate_info_type(void)
 {
-	int err;
-	u32 fn;
+	return invoke_psci_fn(PSCI_0_2_FN_MIGRATE_INFO_TYPE, 0, 0, 0);
+}
 
-	fn = psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE];
-	err = invoke_psci_fn(fn, 0, 0, 0);
-	return err;
+static unsigned long psci_migrate_info_up_cpu(void)
+{
+	return invoke_psci_fn(PSCI_0_2_FN64_MIGRATE_INFO_UP_CPU, 0, 0, 0);
 }
 
-static int __maybe_unused cpu_psci_cpu_init_idle(struct device_node *cpu_node,
-						 unsigned int cpu)
+static int __maybe_unused cpu_psci_cpu_init_idle(unsigned int cpu)
 {
 	int i, ret, count = 0;
-	struct psci_power_state *psci_states;
-	struct device_node *state_node;
+	u32 *psci_states;
+	struct device_node *state_node, *cpu_node;
+
+	cpu_node = of_get_cpu_node(cpu, NULL);
+	if (!cpu_node)
+		return -ENODEV;
 
 	/*
 	 * If the PSCI cpu_suspend function hook has not been initialized
@@ -215,13 +198,13 @@ static int __maybe_unused cpu_psci_cpu_init_idle(struct device_node *cpu_node,
 		return -ENOMEM;
 
 	for (i = 0; i < count; i++) {
-		u32 psci_power_state;
+		u32 state;
 
 		state_node = of_parse_phandle(cpu_node, "cpu-idle-states", i);
 
 		ret = of_property_read_u32(state_node,
 					   "arm,psci-suspend-param",
-					   &psci_power_state);
+					   &state);
 		if (ret) {
 			pr_warn(" * %s missing arm,psci-suspend-param property\n",
 				state_node->full_name);
@@ -230,9 +213,13 @@ static int __maybe_unused cpu_psci_cpu_init_idle(struct device_node *cpu_node,
 		}
 
 		of_node_put(state_node);
-		pr_debug("psci-power-state %#x index %d\n", psci_power_state,
-							    i);
-		psci_power_state_unpack(psci_power_state, &psci_states[i]);
+		pr_debug("psci-power-state %#x index %d\n", state, i);
+		if (!psci_power_state_is_valid(state)) {
+			pr_warn("Invalid PSCI power state %#x\n", state);
+			ret = -EINVAL;
+			goto free_mem;
+		}
+		psci_states[i] = state;
 	}
 	/* Idle states parsed correctly, initialize per-cpu pointer */
 	per_cpu(psci_power_state, cpu) = psci_states;
@@ -275,6 +262,46 @@ static void psci_sys_poweroff(void)
 	invoke_psci_fn(PSCI_0_2_FN_SYSTEM_OFF, 0, 0, 0);
 }
 
+/*
+ * Detect the presence of a resident Trusted OS which may cause CPU_OFF to
+ * return DENIED (which would be fatal).
+ */
+static void __init psci_init_migrate(void)
+{
+	unsigned long cpuid;
+	int type, cpu;
+
+	type = psci_ops.migrate_info_type();
+
+	if (type == PSCI_0_2_TOS_MP) {
+		pr_info("Trusted OS migration not required\n");
+		return;
+	}
+
+	if (type == PSCI_RET_NOT_SUPPORTED) {
+		pr_info("MIGRATE_INFO_TYPE not supported.\n");
+		return;
+	}
+
+	if (type != PSCI_0_2_TOS_UP_MIGRATE &&
+	    type != PSCI_0_2_TOS_UP_NO_MIGRATE) {
+		pr_err("MIGRATE_INFO_TYPE returned unknown type (%d)\n", type);
+		return;
+	}
+
+	cpuid = psci_migrate_info_up_cpu();
+	if (cpuid & ~MPIDR_HWID_BITMASK) {
+		pr_warn("MIGRATE_INFO_UP_CPU reported invalid physical ID (0x%lx)\n",
+			cpuid);
+		return;
+	}
+
+	cpu = get_logical_index(cpuid);
+	resident_cpu = cpu >= 0 ? cpu : -1;
+
+	pr_info("Trusted OS resident on physical CPU 0x%lx\n", cpuid);
+}
+
 static void __init psci_0_2_set_functions(void)
 {
 	pr_info("Using standard PSCI v0.2 function IDs\n");
@@ -290,11 +317,8 @@ static void __init psci_0_2_set_functions(void)
 	psci_function_id[PSCI_FN_MIGRATE] = PSCI_0_2_FN64_MIGRATE;
 	psci_ops.migrate = psci_migrate;
 
-	psci_function_id[PSCI_FN_AFFINITY_INFO] = PSCI_0_2_FN64_AFFINITY_INFO;
 	psci_ops.affinity_info = psci_affinity_info;
 
-	psci_function_id[PSCI_FN_MIGRATE_INFO_TYPE] =
-		PSCI_0_2_FN_MIGRATE_INFO_TYPE;
 	psci_ops.migrate_info_type = psci_migrate_info_type;
 
 	arm_pm_restart = psci_sys_reset;
@@ -307,32 +331,26 @@ static void __init psci_0_2_set_functions(void)
  */
 static int __init psci_probe(void)
 {
-	int ver = psci_get_version();
-
-	if (ver == PSCI_RET_NOT_SUPPORTED) {
-		/*
-		 * PSCI versions >=0.2 mandates implementation of
-		 * PSCI_VERSION.
-		 */
-		pr_err("PSCI firmware does not comply with the v0.2 spec.\n");
-		return -EOPNOTSUPP;
-	} else {
-		pr_info("PSCIv%d.%d detected in firmware.\n",
-				PSCI_VERSION_MAJOR(ver),
-				PSCI_VERSION_MINOR(ver));
-
-		if (PSCI_VERSION_MAJOR(ver) == 0 &&
-				PSCI_VERSION_MINOR(ver) < 2) {
-			pr_err("Conflicting PSCI version detected.\n");
-			return -EINVAL;
-		}
+	u32 ver = psci_get_version();
+
+	pr_info("PSCIv%d.%d detected in firmware.\n",
+			PSCI_VERSION_MAJOR(ver),
+			PSCI_VERSION_MINOR(ver));
+
+	if (PSCI_VERSION_MAJOR(ver) == 0 && PSCI_VERSION_MINOR(ver) < 2) {
+		pr_err("Conflicting PSCI version detected.\n");
+		return -EINVAL;
 	}
 
 	psci_0_2_set_functions();
 
+	psci_init_migrate();
+
 	return 0;
 }
 
+typedef int (*psci_initcall_t)(const struct device_node *);
+
 /*
  * PSCI init function for PSCI versions >=0.2
  *
@@ -421,6 +439,7 @@ int __init psci_dt_init(void)
 	return init_fn(np);
 }
 
+#ifdef CONFIG_ACPI
 /*
  * We use PSCI 0.2+ when ACPI is deployed on ARM64 and it's
  * explicitly clarified in SBBR
@@ -441,10 +460,11 @@ int __init psci_acpi_init(void)
 
 	return psci_probe();
 }
+#endif
 
 #ifdef CONFIG_SMP
 
-static int __init cpu_psci_cpu_init(struct device_node *dn, unsigned int cpu)
+static int __init cpu_psci_cpu_init(unsigned int cpu)
 {
 	return 0;
 }
@@ -469,11 +489,21 @@ static int cpu_psci_cpu_boot(unsigned int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+static bool psci_tos_resident_on(int cpu)
+{
+	return cpu == resident_cpu;
+}
+
 static int cpu_psci_cpu_disable(unsigned int cpu)
 {
 	/* Fail early if we don't have CPU_OFF support */
 	if (!psci_ops.cpu_off)
 		return -EOPNOTSUPP;
+
+	/* Trusted OS will deny CPU_OFF */
+	if (psci_tos_resident_on(cpu))
+		return -EPERM;
+
 	return 0;
 }
 
@@ -484,9 +514,8 @@ static void cpu_psci_cpu_die(unsigned int cpu)
 	 * There are no known implementations of PSCI actually using the
 	 * power state field, pass a sensible default for now.
 	 */
-	struct psci_power_state state = {
-		.type = PSCI_POWER_STATE_TYPE_POWER_DOWN,
-	};
+	u32 state = PSCI_POWER_STATE_TYPE_POWER_DOWN <<
+		    PSCI_0_2_POWER_STATE_TYPE_SHIFT;
 
 	ret = psci_ops.cpu_off(state);
 
@@ -498,7 +527,7 @@ static int cpu_psci_cpu_kill(unsigned int cpu)
 	int err, i;
 
 	if (!psci_ops.affinity_info)
-		return 1;
+		return 0;
 	/*
 	 * cpu_kill could race with cpu_die and we can
 	 * potentially end up declaring this cpu undead
@@ -509,7 +538,7 @@ static int cpu_psci_cpu_kill(unsigned int cpu)
 		err = psci_ops.affinity_info(cpu_logical_map(cpu), 0);
 		if (err == PSCI_0_2_AFFINITY_LEVEL_OFF) {
 			pr_info("CPU%d killed.\n", cpu);
-			return 1;
+			return 0;
 		}
 
 		msleep(10);
@@ -518,15 +547,14 @@ static int cpu_psci_cpu_kill(unsigned int cpu)
 
 	pr_warn("CPU%d may not have shut down cleanly (AFFINITY_INFO reports %d)\n",
 			cpu, err);
-	/* Make op_cpu_kill() fail. */
-	return 0;
+	return -ETIMEDOUT;
 }
 #endif
 #endif
 
 static int psci_suspend_finisher(unsigned long index)
 {
-	struct psci_power_state *state = __this_cpu_read(psci_power_state);
+	u32 *state = __this_cpu_read(psci_power_state);
 
 	return psci_ops.cpu_suspend(state[index - 1],
 				    virt_to_phys(cpu_resume));
@@ -535,7 +563,7 @@ static int psci_suspend_finisher(unsigned long index)
 static int __maybe_unused cpu_psci_cpu_suspend(unsigned long index)
 {
 	int ret;
-	struct psci_power_state *state = __this_cpu_read(psci_power_state);
+	u32 *state = __this_cpu_read(psci_power_state);
 	/*
 	 * idle state index 0 corresponds to wfi, should never be called
 	 * from the cpu_suspend operations
@@ -543,10 +571,10 @@ static int __maybe_unused cpu_psci_cpu_suspend(unsigned long index)
 	if (WARN_ON_ONCE(!index))
 		return -EINVAL;
 
-	if (state[index - 1].type == PSCI_POWER_STATE_TYPE_STANDBY)
+	if (!psci_power_state_loses_context(state[index - 1]))
 		ret = psci_ops.cpu_suspend(state[index - 1], 0);
 	else
-		ret = __cpu_suspend(index, psci_suspend_finisher);
+		ret = cpu_suspend(index, psci_suspend_finisher);
 
 	return ret;
 }
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 74753132c3ac..ffd3970721bf 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -105,18 +105,6 @@ static struct resource mem_res[] = {
 #define kernel_code mem_res[0]
 #define kernel_data mem_res[1]
 
-void __init early_print(const char *str, ...)
-{
-	char buf[256];
-	va_list ap;
-
-	va_start(ap, str);
-	vsnprintf(buf, sizeof(buf), str, ap);
-	va_end(ap);
-
-	printk("%s", buf);
-}
-
 /*
  * The recorded values of x0 .. x3 upon kernel entry.
  */
@@ -326,12 +314,14 @@ static void __init setup_processor(void)
 
 static void __init setup_machine_fdt(phys_addr_t dt_phys)
 {
-	if (!dt_phys || !early_init_dt_scan(phys_to_virt(dt_phys))) {
-		early_print("\n"
-			"Error: invalid device tree blob at physical address 0x%p (virtual address 0x%p)\n"
-			"The dtb must be 8-byte aligned and passed in the first 512MB of memory\n"
-			"\nPlease check your bootloader.\n",
-			dt_phys, phys_to_virt(dt_phys));
+	void *dt_virt = fixmap_remap_fdt(dt_phys);
+
+	if (!dt_virt || !early_init_dt_scan(dt_virt)) {
+		pr_crit("\n"
+			"Error: invalid device tree blob at physical address %pa (virtual address 0x%p)\n"
+			"The dtb must be 8-byte aligned and must not exceed 2 MB in size\n"
+			"\nPlease check your bootloader.",
+			&dt_phys, dt_virt);
 
 		while (true)
 			cpu_relax();
@@ -374,8 +364,6 @@ void __init setup_arch(char **cmdline_p)
 {
 	setup_processor();
 
-	setup_machine_fdt(__fdt_pointer);
-
 	init_mm.start_code = (unsigned long) _text;
 	init_mm.end_code   = (unsigned long) _etext;
 	init_mm.end_data   = (unsigned long) _edata;
@@ -386,6 +374,8 @@ void __init setup_arch(char **cmdline_p)
 	early_fixmap_init();
 	early_ioremap_init();
 
+	setup_machine_fdt(__fdt_pointer);
+
 	parse_early_param();
 
 	/*
@@ -408,16 +398,13 @@ void __init setup_arch(char **cmdline_p)
 	if (acpi_disabled) {
 		unflatten_device_tree();
 		psci_dt_init();
-		cpu_read_bootcpu_ops();
-#ifdef CONFIG_SMP
-		of_smp_init_cpus();
-#endif
 	} else {
 		psci_acpi_init();
-		acpi_init_cpus();
 	}
 
+	cpu_read_bootcpu_ops();
 #ifdef CONFIG_SMP
+	smp_init_cpus();
 	smp_build_mpidr_hash();
 #endif
 
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index d26fcd4cd6e6..1670f15ef69e 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -370,7 +370,7 @@ badframe:
 	if (show_unhandled_signals)
 		pr_info_ratelimited("%s[%d]: bad frame in %s: pc=%08llx sp=%08llx\n",
 				    current->comm, task_pid_nr(current), __func__,
-				    regs->pc, regs->sp);
+				    regs->pc, regs->compat_sp);
 	force_sig(SIGSEGV, current);
 	return 0;
 }
@@ -407,7 +407,7 @@ badframe:
 	if (show_unhandled_signals)
 		pr_info_ratelimited("%s[%d]: bad frame in %s: pc=%08llx sp=%08llx\n",
 				    current->comm, task_pid_nr(current), __func__,
-				    regs->pc, regs->sp);
+				    regs->pc, regs->compat_sp);
 	force_sig(SIGSEGV, current);
 	return 0;
 }
diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S
index ede186cdd452..803cfea41962 100644
--- a/arch/arm64/kernel/sleep.S
+++ b/arch/arm64/kernel/sleep.S
@@ -130,12 +130,14 @@ ENDPROC(__cpu_suspend_enter)
 /*
  * x0 must contain the sctlr value retrieved from restored context
  */
+	.pushsection	".idmap.text", "ax"
 ENTRY(cpu_resume_mmu)
 	ldr	x3, =cpu_resume_after_mmu
 	msr	sctlr_el1, x0		// restore sctlr_el1
 	isb
 	br	x3			// global jump to virtual address
 ENDPROC(cpu_resume_mmu)
+	.popsection
 cpu_resume_after_mmu:
 	mov	x0, #0			// return zero on success
 	ldp	x19, x20, [sp, #16]
@@ -162,15 +164,12 @@ ENTRY(cpu_resume)
 #else
 	mov	x7, xzr
 #endif
-	adrp	x0, sleep_save_sp
-	add	x0, x0, #:lo12:sleep_save_sp
-	ldr	x0, [x0, #SLEEP_SAVE_SP_PHYS]
+	ldr_l	x0, sleep_save_sp + SLEEP_SAVE_SP_PHYS
 	ldr	x0, [x0, x7, lsl #3]
 	/* load sp from context */
 	ldr	x2, [x0, #CPU_CTX_SP]
-	adrp	x1, sleep_idmap_phys
 	/* load physical address of identity map page table in x1 */
-	ldr	x1, [x1, #:lo12:sleep_idmap_phys]
+	adrp	x1, idmap_pg_dir
 	mov	sp, x2
 	/*
 	 * cpu_do_resume expects x0 to contain context physical address
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 2cb008177252..4b2121bd7f9c 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -17,6 +17,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/acpi.h>
 #include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
@@ -248,20 +249,20 @@ static int op_cpu_kill(unsigned int cpu)
 	 * time and hope that it's dead, so let's skip the wait and just hope.
 	 */
 	if (!cpu_ops[cpu]->cpu_kill)
-		return 1;
+		return 0;
 
 	return cpu_ops[cpu]->cpu_kill(cpu);
 }
 
-static DECLARE_COMPLETION(cpu_died);
-
 /*
  * called on the thread which is asking for a CPU to be shutdown -
  * waits until shutdown has completed, or it is timed out.
  */
 void __cpu_die(unsigned int cpu)
 {
-	if (!wait_for_completion_timeout(&cpu_died, msecs_to_jiffies(5000))) {
+	int err;
+
+	if (!cpu_wait_death(cpu, 5)) {
 		pr_crit("CPU%u: cpu didn't die\n", cpu);
 		return;
 	}
@@ -273,8 +274,10 @@ void __cpu_die(unsigned int cpu)
 	 * verify that it has really left the kernel before we consider
 	 * clobbering anything it might still be using.
 	 */
-	if (!op_cpu_kill(cpu))
-		pr_warn("CPU%d may not have shut down cleanly\n", cpu);
+	err = op_cpu_kill(cpu);
+	if (err)
+		pr_warn("CPU%d may not have shut down cleanly: %d\n",
+			cpu, err);
 }
 
 /*
@@ -294,7 +297,7 @@ void cpu_die(void)
 	local_irq_disable();
 
 	/* Tell __cpu_die() that this CPU is now safe to dispose of */
-	complete(&cpu_died);
+	(void)cpu_report_death();
 
 	/*
 	 * Actually shutdown the CPU. This must never fail. The specific hotplug
@@ -318,57 +321,158 @@ void __init smp_prepare_boot_cpu(void)
 	set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
 }
 
+static u64 __init of_get_cpu_mpidr(struct device_node *dn)
+{
+	const __be32 *cell;
+	u64 hwid;
+
+	/*
+	 * A cpu node with missing "reg" property is
+	 * considered invalid to build a cpu_logical_map
+	 * entry.
+	 */
+	cell = of_get_property(dn, "reg", NULL);
+	if (!cell) {
+		pr_err("%s: missing reg property\n", dn->full_name);
+		return INVALID_HWID;
+	}
+
+	hwid = of_read_number(cell, of_n_addr_cells(dn));
+	/*
+	 * Non affinity bits must be set to 0 in the DT
+	 */
+	if (hwid & ~MPIDR_HWID_BITMASK) {
+		pr_err("%s: invalid reg property\n", dn->full_name);
+		return INVALID_HWID;
+	}
+	return hwid;
+}
+
+/*
+ * Duplicate MPIDRs are a recipe for disaster. Scan all initialized
+ * entries and check for duplicates. If any is found just ignore the
+ * cpu. cpu_logical_map was initialized to INVALID_HWID to avoid
+ * matching valid MPIDR values.
+ */
+static bool __init is_mpidr_duplicate(unsigned int cpu, u64 hwid)
+{
+	unsigned int i;
+
+	for (i = 1; (i < cpu) && (i < NR_CPUS); i++)
+		if (cpu_logical_map(i) == hwid)
+			return true;
+	return false;
+}
+
+/*
+ * Initialize cpu operations for a logical cpu and
+ * set it in the possible mask on success
+ */
+static int __init smp_cpu_setup(int cpu)
+{
+	if (cpu_read_ops(cpu))
+		return -ENODEV;
+
+	if (cpu_ops[cpu]->cpu_init(cpu))
+		return -ENODEV;
+
+	set_cpu_possible(cpu, true);
+
+	return 0;
+}
+
+static bool bootcpu_valid __initdata;
+static unsigned int cpu_count = 1;
+
+#ifdef CONFIG_ACPI
+/*
+ * acpi_map_gic_cpu_interface - parse processor MADT entry
+ *
+ * Carry out sanity checks on MADT processor entry and initialize
+ * cpu_logical_map on success
+ */
+static void __init
+acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor)
+{
+	u64 hwid = processor->arm_mpidr;
+
+	if (hwid & ~MPIDR_HWID_BITMASK || hwid == INVALID_HWID) {
+		pr_err("skipping CPU entry with invalid MPIDR 0x%llx\n", hwid);
+		return;
+	}
+
+	if (!(processor->flags & ACPI_MADT_ENABLED)) {
+		pr_err("skipping disabled CPU entry with 0x%llx MPIDR\n", hwid);
+		return;
+	}
+
+	if (is_mpidr_duplicate(cpu_count, hwid)) {
+		pr_err("duplicate CPU MPIDR 0x%llx in MADT\n", hwid);
+		return;
+	}
+
+	/* Check if GICC structure of boot CPU is available in the MADT */
+	if (cpu_logical_map(0) == hwid) {
+		if (bootcpu_valid) {
+			pr_err("duplicate boot CPU MPIDR: 0x%llx in MADT\n",
+			       hwid);
+			return;
+		}
+		bootcpu_valid = true;
+		return;
+	}
+
+	if (cpu_count >= NR_CPUS)
+		return;
+
+	/* map the logical cpu id to cpu MPIDR */
+	cpu_logical_map(cpu_count) = hwid;
+
+	cpu_count++;
+}
+
+static int __init
+acpi_parse_gic_cpu_interface(struct acpi_subtable_header *header,
+			     const unsigned long end)
+{
+	struct acpi_madt_generic_interrupt *processor;
+
+	processor = (struct acpi_madt_generic_interrupt *)header;
+	if (BAD_MADT_ENTRY(processor, end))
+		return -EINVAL;
+
+	acpi_table_print_madt_entry(header);
+
+	acpi_map_gic_cpu_interface(processor);
+
+	return 0;
+}
+#else
+#define acpi_table_parse_madt(...)	do { } while (0)
+#endif
+
 /*
  * Enumerate the possible CPU set from the device tree and build the
  * cpu logical map array containing MPIDR values related to logical
  * cpus. Assumes that cpu_logical_map(0) has already been initialized.
  */
-void __init of_smp_init_cpus(void)
+void __init of_parse_and_init_cpus(void)
 {
 	struct device_node *dn = NULL;
-	unsigned int i, cpu = 1;
-	bool bootcpu_valid = false;
 
 	while ((dn = of_find_node_by_type(dn, "cpu"))) {
-		const u32 *cell;
-		u64 hwid;
+		u64 hwid = of_get_cpu_mpidr(dn);
 
-		/*
-		 * A cpu node with missing "reg" property is
-		 * considered invalid to build a cpu_logical_map
-		 * entry.
-		 */
-		cell = of_get_property(dn, "reg", NULL);
-		if (!cell) {
-			pr_err("%s: missing reg property\n", dn->full_name);
+		if (hwid == INVALID_HWID)
 			goto next;
-		}
-		hwid = of_read_number(cell, of_n_addr_cells(dn));
 
-		/*
-		 * Non affinity bits must be set to 0 in the DT
-		 */
-		if (hwid & ~MPIDR_HWID_BITMASK) {
-			pr_err("%s: invalid reg property\n", dn->full_name);
+		if (is_mpidr_duplicate(cpu_count, hwid)) {
+			pr_err("%s: duplicate cpu reg properties in the DT\n",
+				dn->full_name);
 			goto next;
 		}
 
 		/*
-		 * Duplicate MPIDRs are a recipe for disaster. Scan
-		 * all initialized entries and check for
-		 * duplicates. If any is found just ignore the cpu.
-		 * cpu_logical_map was initialized to INVALID_HWID to
-		 * avoid matching valid MPIDR values.
-		 */
-		for (i = 1; (i < cpu) && (i < NR_CPUS); i++) {
-			if (cpu_logical_map(i) == hwid) {
-				pr_err("%s: duplicate cpu reg properties in the DT\n",
-					dn->full_name);
-				goto next;
-			}
-		}
-
-		/*
 		 * The numbering scheme requires that the boot CPU
 		 * must be assigned logical id 0. Record it so that
 		 * the logical map built from DT is validated and can
@@ -392,38 +496,58 @@ void __init of_smp_init_cpus(void)
 			continue;
 		}
 
-		if (cpu >= NR_CPUS)
-			goto next;
-
-		if (cpu_read_ops(dn, cpu) != 0)
-			goto next;
-
-		if (cpu_ops[cpu]->cpu_init(dn, cpu))
+		if (cpu_count >= NR_CPUS)
 			goto next;
 
 		pr_debug("cpu logical map 0x%llx\n", hwid);
-		cpu_logical_map(cpu) = hwid;
+		cpu_logical_map(cpu_count) = hwid;
 next:
-		cpu++;
+		cpu_count++;
 	}
+}
+
+/*
+ * Enumerate the possible CPU set from the device tree or ACPI and build the
+ * cpu logical map array containing MPIDR values related to logical
+ * cpus. Assumes that cpu_logical_map(0) has already been initialized.
+ */
+void __init smp_init_cpus(void)
+{
+	int i;
 
-	/* sanity check */
-	if (cpu > NR_CPUS)
-		pr_warning("no. of cores (%d) greater than configured maximum of %d - clipping\n",
-			   cpu, NR_CPUS);
+	if (acpi_disabled)
+		of_parse_and_init_cpus();
+	else
+		/*
+		 * do a walk of MADT to determine how many CPUs
+		 * we have including disabled CPUs, and get information
+		 * we need for SMP init
+		 */
+		acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
+				      acpi_parse_gic_cpu_interface, 0);
+
+	if (cpu_count > NR_CPUS)
+		pr_warn("no. of cores (%d) greater than configured maximum of %d - clipping\n",
+			cpu_count, NR_CPUS);
 
 	if (!bootcpu_valid) {
-		pr_err("DT missing boot CPU MPIDR, not enabling secondaries\n");
+		pr_err("missing boot CPU MPIDR, not enabling secondaries\n");
 		return;
 	}
 
 	/*
-	 * All the cpus that made it to the cpu_logical_map have been
-	 * validated so set them as possible cpus.
+	 * We need to set the cpu_logical_map entries before enabling
+	 * the cpus so that cpu processor description entries (DT cpu nodes
+	 * and ACPI MADT entries) can be retrieved by matching the cpu hwid
+	 * with entries in cpu_logical_map while initializing the cpus.
+	 * If the cpu set-up fails, invalidate the cpu_logical_map entry.
 	 */
-	for (i = 0; i < NR_CPUS; i++)
-		if (cpu_logical_map(i) != INVALID_HWID)
-			set_cpu_possible(i, true);
+	for (i = 1; i < NR_CPUS; i++) {
+		if (cpu_logical_map(i) != INVALID_HWID) {
+			if (smp_cpu_setup(i))
+				cpu_logical_map(i) = INVALID_HWID;
+		}
+	}
 }
 
 void __init smp_prepare_cpus(unsigned int max_cpus)
diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c
index 14944e5b28da..aef3605a8c47 100644
--- a/arch/arm64/kernel/smp_spin_table.c
+++ b/arch/arm64/kernel/smp_spin_table.c
@@ -49,8 +49,14 @@ static void write_pen_release(u64 val)
 }
 
 
-static int smp_spin_table_cpu_init(struct device_node *dn, unsigned int cpu)
+static int smp_spin_table_cpu_init(unsigned int cpu)
 {
+	struct device_node *dn;
+
+	dn = of_get_cpu_node(cpu, NULL);
+	if (!dn)
+		return -ENODEV;
+
 	/*
 	 * Determine the address from which the CPU is polling.
 	 */
diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c
index d7daf45ae7a2..8297d502217e 100644
--- a/arch/arm64/kernel/suspend.c
+++ b/arch/arm64/kernel/suspend.c
@@ -51,13 +51,13 @@ void __init cpu_suspend_set_dbg_restorer(void (*hw_bp_restore)(void *))
 }
 
 /*
- * __cpu_suspend
+ * cpu_suspend
  *
  * arg: argument to pass to the finisher function
  * fn: finisher function pointer
  *
  */
-int __cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
+int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 {
 	struct mm_struct *mm = current->active_mm;
 	int ret;
@@ -82,7 +82,7 @@ int __cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 		 * We are resuming from reset with TTBR0_EL1 set to the
 		 * idmap to enable the MMU; restore the active_mm mappings in
 		 * TTBR0_EL1 unless the active_mm == &init_mm, in which case
-		 * the thread entered __cpu_suspend with TTBR0_EL1 set to
+		 * the thread entered cpu_suspend with TTBR0_EL1 set to
 		 * reserved TTBR0 page tables and should be restored as such.
 		 */
 		if (mm == &init_mm)
@@ -118,7 +118,6 @@ int __cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 }
 
 struct sleep_save_sp sleep_save_sp;
-phys_addr_t sleep_idmap_phys;
 
 static int __init cpu_suspend_init(void)
 {
@@ -132,9 +131,7 @@ static int __init cpu_suspend_init(void)
 
 	sleep_save_sp.save_ptr_stash = ctx_ptr;
 	sleep_save_sp.save_ptr_stash_phys = virt_to_phys(ctx_ptr);
-	sleep_idmap_phys = virt_to_phys(idmap_pg_dir);
 	__flush_dcache_area(&sleep_save_sp, sizeof(struct sleep_save_sp));
-	__flush_dcache_area(&sleep_idmap_phys, sizeof(sleep_idmap_phys));
 
 	return 0;
 }
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 1ef2940df13c..a12251c074a8 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -335,8 +335,7 @@ asmlinkage void __exception do_undefinstr(struct pt_regs *regs)
 	if (call_undef_hook(regs) == 0)
 		return;
 
-	if (show_unhandled_signals && unhandled_signal(current, SIGILL) &&
-	    printk_ratelimit()) {
+	if (show_unhandled_signals_ratelimited() && unhandled_signal(current, SIGILL)) {
 		pr_info("%s[%d]: undefined instruction: pc=%p\n",
 			current->comm, task_pid_nr(current), pc);
 		dump_instr(KERN_INFO, regs);
@@ -363,7 +362,7 @@ asmlinkage long do_ni_syscall(struct pt_regs *regs)
 	}
 #endif
 
-	if (show_unhandled_signals && printk_ratelimit()) {
+	if (show_unhandled_signals_ratelimited()) {
 		pr_info("%s[%d]: syscall %d\n", current->comm,
 			task_pid_nr(current), (int)regs->syscallno);
 		dump_instr("", regs);
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index ff3bddea482d..f6fe17d88da5 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -15,6 +15,10 @@ ccflags-y := -shared -fno-common -fno-builtin
 ccflags-y += -nostdlib -Wl,-soname=linux-vdso.so.1 \
 		$(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
 
+# Workaround for bare-metal (ELF) toolchains that neglect to pass -shared
+# down to collect2, resulting in silent corruption of the vDSO image.
+ccflags-y += -Wl,-shared
+
 obj-y += vdso.o
 extra-y += vdso.lds vdso-offsets.h
 CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index a2c29865c3fe..98073332e2d0 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -38,6 +38,12 @@ jiffies = jiffies_64;
 	*(.hyp.text)					\
 	VMLINUX_SYMBOL(__hyp_text_end) = .;
 
+#define IDMAP_TEXT					\
+	. = ALIGN(SZ_4K);				\
+	VMLINUX_SYMBOL(__idmap_text_start) = .;		\
+	*(.idmap.text)					\
+	VMLINUX_SYMBOL(__idmap_text_end) = .;
+
 /*
  * The size of the PE/COFF section that covers the kernel image, which
  * runs from stext to _edata, must be a round multiple of the PE/COFF
@@ -95,6 +101,7 @@ SECTIONS
 			SCHED_TEXT
 			LOCK_TEXT
 			HYPERVISOR_TEXT
+			IDMAP_TEXT
 			*(.fixup)
 			*(.gnu.warning)
 		. = ALIGN(16);
@@ -167,11 +174,13 @@ SECTIONS
 }
 
 /*
- * The HYP init code can't be more than a page long,
+ * The HYP init code and ID map text can't be longer than a page each,
  * and should not cross a page boundary.
  */
 ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
 	"HYP init code too big or misaligned")
+ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
+	"ID map text too big or misaligned")
 
 /*
  * If padding is applied before .head.text, virt<->phys conversions will fail.
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 5105e297ed5f..bfffe8f4bd53 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
 	select KVM_ARM_HOST
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
 	select SRCU
+	select KVM_VFIO
 	select HAVE_KVM_EVENTFD
 	select HAVE_KVM_IRQFD
 	---help---
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index d5904f876cdb..f90f4aa7f88d 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -11,7 +11,7 @@ ARM=../../../arch/arm/kvm
 
 obj-$(CONFIG_KVM_ARM_HOST) += kvm.o
 
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/arm.o $(ARM)/mmu.o $(ARM)/mmio.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
 
diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
index 5befd010e232..17a8fb14f428 100644
--- a/arch/arm64/kvm/hyp.S
+++ b/arch/arm64/kvm/hyp.S
@@ -17,8 +17,10 @@
 
 #include <linux/linkage.h>
 
+#include <asm/alternative.h>
 #include <asm/asm-offsets.h>
 #include <asm/assembler.h>
+#include <asm/cpufeature.h>
 #include <asm/debug-monitors.h>
 #include <asm/esr.h>
 #include <asm/fpsimdmacros.h>
@@ -50,8 +52,8 @@
 	stp	x29, lr, [x3, #80]
 
 	mrs	x19, sp_el0
-	mrs	x20, elr_el2		// EL1 PC
-	mrs	x21, spsr_el2		// EL1 pstate
+	mrs	x20, elr_el2		// pc before entering el2
+	mrs	x21, spsr_el2		// pstate before entering el2
 
 	stp	x19, x20, [x3, #96]
 	str	x21, [x3, #112]
@@ -82,8 +84,8 @@
 	ldr	x21, [x3, #16]
 
 	msr	sp_el0, x19
-	msr	elr_el2, x20 				// EL1 PC
-	msr	spsr_el2, x21 				// EL1 pstate
+	msr	elr_el2, x20 		// pc on return from el2
+	msr	spsr_el2, x21 		// pstate on return from el2
 
 	add	x3, x2, #CPU_XREG_OFFSET(19)
 	ldp	x19, x20, [x3]
@@ -808,10 +810,7 @@
  * Call into the vgic backend for state saving
  */
 .macro save_vgic_state
-	adr	x24, __vgic_sr_vectors
-	ldr	x24, [x24, VGIC_SAVE_FN]
-	kern_hyp_va	x24
-	blr	x24
+	alternative_insn "bl __save_vgic_v2_state", "bl __save_vgic_v3_state", ARM64_HAS_SYSREG_GIC_CPUIF
 	mrs	x24, hcr_el2
 	mov	x25, #HCR_INT_OVERRIDE
 	neg	x25, x25
@@ -828,10 +827,7 @@
 	orr	x24, x24, #HCR_INT_OVERRIDE
 	orr	x24, x24, x25
 	msr	hcr_el2, x24
-	adr	x24, __vgic_sr_vectors
-	ldr	x24, [x24, #VGIC_RESTORE_FN]
-	kern_hyp_va	x24
-	blr	x24
+	alternative_insn "bl __restore_vgic_v2_state", "bl __restore_vgic_v3_state", ARM64_HAS_SYSREG_GIC_CPUIF
 .endm
 
 .macro save_timer_state
@@ -1062,12 +1058,6 @@ ENTRY(__kvm_flush_vm_context)
 	ret
 ENDPROC(__kvm_flush_vm_context)
 
-	// struct vgic_sr_vectors __vgi_sr_vectors;
-	.align 3
-ENTRY(__vgic_sr_vectors)
-	.skip	VGIC_SR_VECTOR_SZ
-ENDPROC(__vgic_sr_vectors)
-
 __kvm_hyp_panic:
 	// Guess the context by looking at VTTBR:
 	// If zero, then we're already a host.
diff --git a/arch/arm64/kvm/vgic-v2-switch.S b/arch/arm64/kvm/vgic-v2-switch.S
index f002fe1c3700..3f000712a85d 100644
--- a/arch/arm64/kvm/vgic-v2-switch.S
+++ b/arch/arm64/kvm/vgic-v2-switch.S
@@ -47,7 +47,6 @@ __save_vgic_v2_state:
 	add	x3, x0, #VCPU_VGIC_CPU
 
 	/* Save all interesting registers */
-	ldr	w4, [x2, #GICH_HCR]
 	ldr	w5, [x2, #GICH_VMCR]
 	ldr	w6, [x2, #GICH_MISR]
 	ldr	w7, [x2, #GICH_EISR0]
@@ -55,7 +54,6 @@ __save_vgic_v2_state:
 	ldr	w9, [x2, #GICH_ELRSR0]
 	ldr	w10, [x2, #GICH_ELRSR1]
 	ldr	w11, [x2, #GICH_APR]
-CPU_BE(	rev	w4,  w4  )
 CPU_BE(	rev	w5,  w5  )
 CPU_BE(	rev	w6,  w6  )
 CPU_BE(	rev	w7,  w7  )
@@ -64,7 +62,6 @@ CPU_BE(	rev	w9,  w9  )
 CPU_BE(	rev	w10, w10 )
 CPU_BE(	rev	w11, w11 )
 
-	str	w4, [x3, #VGIC_V2_CPU_HCR]
 	str	w5, [x3, #VGIC_V2_CPU_VMCR]
 	str	w6, [x3, #VGIC_V2_CPU_MISR]
 CPU_LE(	str	w7, [x3, #VGIC_V2_CPU_EISR] )
diff --git a/arch/arm64/kvm/vgic-v3-switch.S b/arch/arm64/kvm/vgic-v3-switch.S
index 617a012a0107..3c20730ddff5 100644
--- a/arch/arm64/kvm/vgic-v3-switch.S
+++ b/arch/arm64/kvm/vgic-v3-switch.S
@@ -48,13 +48,11 @@
 	dsb	st
 
 	// Save all interesting registers
-	mrs_s	x4, ICH_HCR_EL2
 	mrs_s	x5, ICH_VMCR_EL2
 	mrs_s	x6, ICH_MISR_EL2
 	mrs_s	x7, ICH_EISR_EL2
 	mrs_s	x8, ICH_ELSR_EL2
 
-	str	w4, [x3, #VGIC_V3_CPU_HCR]
 	str	w5, [x3, #VGIC_V3_CPU_VMCR]
 	str	w6, [x3, #VGIC_V3_CPU_MISR]
 	str	w7, [x3, #VGIC_V3_CPU_EISR]
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index 773d37a14039..9d84feb41a16 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -4,3 +4,5 @@ obj-y				:= dma-mapping.o extable.o fault.o init.o \
 				   context.o proc.o pageattr.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_ARM64_PTDUMP)	+= dump.o
+
+CFLAGS_mmu.o			:= -I$(srctree)/scripts/dtc/libfdt/
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 2560e1e1562e..bdeb5d38c2dd 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -22,84 +22,11 @@
 #include <linux/init.h>
 #include <asm/assembler.h>
 #include <asm/cpufeature.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
 
 #include "proc-macros.S"
 
 /*
- *	__flush_dcache_all()
- *
- *	Flush the whole D-cache.
- *
- *	Corrupted registers: x0-x7, x9-x11
- */
-__flush_dcache_all:
-	dmb	sy				// ensure ordering with previous memory accesses
-	mrs	x0, clidr_el1			// read clidr
-	and	x3, x0, #0x7000000		// extract loc from clidr
-	lsr	x3, x3, #23			// left align loc bit field
-	cbz	x3, finished			// if loc is 0, then no need to clean
-	mov	x10, #0				// start clean at cache level 0
-loop1:
-	add	x2, x10, x10, lsr #1		// work out 3x current cache level
-	lsr	x1, x0, x2			// extract cache type bits from clidr
-	and	x1, x1, #7			// mask of the bits for current cache only
-	cmp	x1, #2				// see what cache we have at this level
-	b.lt	skip				// skip if no cache, or just i-cache
-	save_and_disable_irqs x9		// make CSSELR and CCSIDR access atomic
-	msr	csselr_el1, x10			// select current cache level in csselr
-	isb					// isb to sych the new cssr&csidr
-	mrs	x1, ccsidr_el1			// read the new ccsidr
-	restore_irqs x9
-	and	x2, x1, #7			// extract the length of the cache lines
-	add	x2, x2, #4			// add 4 (line length offset)
-	mov	x4, #0x3ff
-	and	x4, x4, x1, lsr #3		// find maximum number on the way size
-	clz	w5, w4				// find bit position of way size increment
-	mov	x7, #0x7fff
-	and	x7, x7, x1, lsr #13		// extract max number of the index size
-loop2:
-	mov	x9, x4				// create working copy of max way size
-loop3:
-	lsl	x6, x9, x5
-	orr	x11, x10, x6			// factor way and cache number into x11
-	lsl	x6, x7, x2
-	orr	x11, x11, x6			// factor index number into x11
-	dc	cisw, x11			// clean & invalidate by set/way
-	subs	x9, x9, #1			// decrement the way
-	b.ge	loop3
-	subs	x7, x7, #1			// decrement the index
-	b.ge	loop2
-skip:
-	add	x10, x10, #2			// increment cache number
-	cmp	x3, x10
-	b.gt	loop1
-finished:
-	mov	x10, #0				// swith back to cache level 0
-	msr	csselr_el1, x10			// select current cache level in csselr
-	dsb	sy
-	isb
-	ret
-ENDPROC(__flush_dcache_all)
-
-/*
- *	flush_cache_all()
- *
- *	Flush the entire cache system.  The data cache flush is now achieved
- *	using atomic clean / invalidates working outwards from L1 cache. This
- *	is done using Set/Way based cache maintainance instructions.  The
- *	instruction cache can still be invalidated back to the point of
- *	unification in a single instruction.
- */
-ENTRY(flush_cache_all)
-	mov	x12, lr
-	bl	__flush_dcache_all
-	mov	x0, #0
-	ic	ialluis				// I+BTB cache invalidate
-	ret	x12
-ENDPROC(flush_cache_all)
-
-/*
  *	flush_icache_range(start,end)
  *
  *	Ensure that the I and D caches are coherent within specified region.
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index baa758d37021..76c1e6cd36fc 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -92,6 +92,14 @@ static void reset_context(void *info)
 	unsigned int cpu = smp_processor_id();
 	struct mm_struct *mm = current->active_mm;
 
+	/*
+	 * current->active_mm could be init_mm for the idle thread immediately
+	 * after secondary CPU boot or hotplug. TTBR0_EL1 is already set to
+	 * the reserved value, so no need to reset any context.
+	 */
+	if (mm == &init_mm)
+		return;
+
 	smp_rmb();
 	asid = cpu_last_asid + cpu;
 
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index b0bd4e5fd5cf..d16a1cead23f 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -414,6 +414,98 @@ out:
 	return -ENOMEM;
 }
 
+/********************************************
+ * The following APIs are for dummy DMA ops *
+ ********************************************/
+
+static void *__dummy_alloc(struct device *dev, size_t size,
+			   dma_addr_t *dma_handle, gfp_t flags,
+			   struct dma_attrs *attrs)
+{
+	return NULL;
+}
+
+static void __dummy_free(struct device *dev, size_t size,
+			 void *vaddr, dma_addr_t dma_handle,
+			 struct dma_attrs *attrs)
+{
+}
+
+static int __dummy_mmap(struct device *dev,
+			struct vm_area_struct *vma,
+			void *cpu_addr, dma_addr_t dma_addr, size_t size,
+			struct dma_attrs *attrs)
+{
+	return -ENXIO;
+}
+
+static dma_addr_t __dummy_map_page(struct device *dev, struct page *page,
+				   unsigned long offset, size_t size,
+				   enum dma_data_direction dir,
+				   struct dma_attrs *attrs)
+{
+	return DMA_ERROR_CODE;
+}
+
+static void __dummy_unmap_page(struct device *dev, dma_addr_t dev_addr,
+			       size_t size, enum dma_data_direction dir,
+			       struct dma_attrs *attrs)
+{
+}
+
+static int __dummy_map_sg(struct device *dev, struct scatterlist *sgl,
+			  int nelems, enum dma_data_direction dir,
+			  struct dma_attrs *attrs)
+{
+	return 0;
+}
+
+static void __dummy_unmap_sg(struct device *dev,
+			     struct scatterlist *sgl, int nelems,
+			     enum dma_data_direction dir,
+			     struct dma_attrs *attrs)
+{
+}
+
+static void __dummy_sync_single(struct device *dev,
+				dma_addr_t dev_addr, size_t size,
+				enum dma_data_direction dir)
+{
+}
+
+static void __dummy_sync_sg(struct device *dev,
+			    struct scatterlist *sgl, int nelems,
+			    enum dma_data_direction dir)
+{
+}
+
+static int __dummy_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
+{
+	return 1;
+}
+
+static int __dummy_dma_supported(struct device *hwdev, u64 mask)
+{
+	return 0;
+}
+
+struct dma_map_ops dummy_dma_ops = {
+	.alloc                  = __dummy_alloc,
+	.free                   = __dummy_free,
+	.mmap                   = __dummy_mmap,
+	.map_page               = __dummy_map_page,
+	.unmap_page             = __dummy_unmap_page,
+	.map_sg                 = __dummy_map_sg,
+	.unmap_sg               = __dummy_unmap_sg,
+	.sync_single_for_cpu    = __dummy_sync_single,
+	.sync_single_for_device = __dummy_sync_single,
+	.sync_sg_for_cpu        = __dummy_sync_sg,
+	.sync_sg_for_device     = __dummy_sync_sg,
+	.mapping_error          = __dummy_mapping_error,
+	.dma_supported          = __dummy_dma_supported,
+};
+EXPORT_SYMBOL(dummy_dma_ops);
+
 static int __init arm64_dma_init(void)
 {
 	int ret;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 96da13167d4a..b1fc69cd1499 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -115,8 +115,7 @@ static void __do_user_fault(struct task_struct *tsk, unsigned long addr,
 {
 	struct siginfo si;
 
-	if (show_unhandled_signals && unhandled_signal(tsk, sig) &&
-	    printk_ratelimit()) {
+	if (show_unhandled_signals_ratelimited() && unhandled_signal(tsk, sig)) {
 		pr_info("%s[%d]: unhandled %s (%d) at 0x%08lx, esr 0x%03x\n",
 			tsk->comm, task_pid_nr(tsk), fault_name(esr), sig,
 			addr, esr);
@@ -211,7 +210,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
 	 * If we're in an interrupt or have no user context, we must not take
 	 * the fault.
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
@@ -478,12 +477,19 @@ asmlinkage void __exception do_sp_pc_abort(unsigned long addr,
 					   struct pt_regs *regs)
 {
 	struct siginfo info;
+	struct task_struct *tsk = current;
+
+	if (show_unhandled_signals && unhandled_signal(tsk, SIGBUS))
+		pr_info_ratelimited("%s[%d]: %s exception: pc=%p sp=%p\n",
+				    tsk->comm, task_pid_nr(tsk),
+				    esr_get_class_string(esr), (void *)regs->pc,
+				    (void *)regs->sp);
 
 	info.si_signo = SIGBUS;
 	info.si_errno = 0;
 	info.si_code  = BUS_ADRALN;
 	info.si_addr  = (void __user *)addr;
-	arm64_notify_die("", regs, &info, esr);
+	arm64_notify_die("Oops - SP/PC alignment exception", regs, &info, esr);
 }
 
 static struct fault_info debug_fault_info[] = {
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index b6f14e8d2121..4dfa3975ce5b 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -102,7 +102,6 @@ EXPORT_SYMBOL(flush_dcache_page);
 /*
  * Additional functions defined in assembly.
  */
-EXPORT_SYMBOL(flush_cache_all);
 EXPORT_SYMBOL(flush_icache_range);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 597831bdddf3..ad87ce826cce 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -262,7 +262,7 @@ static void __init free_unused_memmap(void)
 		 * memmap entries are valid from the bank end aligned to
 		 * MAX_ORDER_NR_PAGES.
 		 */
-		prev_end = ALIGN(start + __phys_to_pfn(reg->size),
+		prev_end = ALIGN(__phys_to_pfn(reg->base + reg->size),
 				 MAX_ORDER_NR_PAGES);
 	}
 
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 5b8b664422d3..82d3435bf14f 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -21,6 +21,7 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/init.h>
+#include <linux/libfdt.h>
 #include <linux/mman.h>
 #include <linux/nodemask.h>
 #include <linux/memblock.h>
@@ -643,3 +644,68 @@ void __set_fixmap(enum fixed_addresses idx,
 		flush_tlb_kernel_range(addr, addr+PAGE_SIZE);
 	}
 }
+
+void *__init fixmap_remap_fdt(phys_addr_t dt_phys)
+{
+	const u64 dt_virt_base = __fix_to_virt(FIX_FDT);
+	pgprot_t prot = PAGE_KERNEL | PTE_RDONLY;
+	int granularity, size, offset;
+	void *dt_virt;
+
+	/*
+	 * Check whether the physical FDT address is set and meets the minimum
+	 * alignment requirement. Since we are relying on MIN_FDT_ALIGN to be
+	 * at least 8 bytes so that we can always access the size field of the
+	 * FDT header after mapping the first chunk, double check here if that
+	 * is indeed the case.
+	 */
+	BUILD_BUG_ON(MIN_FDT_ALIGN < 8);
+	if (!dt_phys || dt_phys % MIN_FDT_ALIGN)
+		return NULL;
+
+	/*
+	 * Make sure that the FDT region can be mapped without the need to
+	 * allocate additional translation table pages, so that it is safe
+	 * to call create_mapping() this early.
+	 *
+	 * On 64k pages, the FDT will be mapped using PTEs, so we need to
+	 * be in the same PMD as the rest of the fixmap.
+	 * On 4k pages, we'll use section mappings for the FDT so we only
+	 * have to be in the same PUD.
+	 */
+	BUILD_BUG_ON(dt_virt_base % SZ_2M);
+
+	if (IS_ENABLED(CONFIG_ARM64_64K_PAGES)) {
+		BUILD_BUG_ON(__fix_to_virt(FIX_FDT_END) >> PMD_SHIFT !=
+			     __fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT);
+
+		granularity = PAGE_SIZE;
+	} else {
+		BUILD_BUG_ON(__fix_to_virt(FIX_FDT_END) >> PUD_SHIFT !=
+			     __fix_to_virt(FIX_BTMAP_BEGIN) >> PUD_SHIFT);
+
+		granularity = PMD_SIZE;
+	}
+
+	offset = dt_phys % granularity;
+	dt_virt = (void *)dt_virt_base + offset;
+
+	/* map the first chunk so we can read the size from the header */
+	create_mapping(round_down(dt_phys, granularity), dt_virt_base,
+		       granularity, prot);
+
+	if (fdt_check_header(dt_virt) != 0)
+		return NULL;
+
+	size = fdt_totalsize(dt_virt);
+	if (size > MAX_FDT_SIZE)
+		return NULL;
+
+	if (offset + size > granularity)
+		create_mapping(round_down(dt_phys, granularity), dt_virt_base,
+			       round_up(offset + size, granularity), prot);
+
+	memblock_reserve(dt_phys, size);
+
+	return dt_virt;
+}
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index cdd754e19b9b..39139a3aa16d 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -46,52 +46,6 @@
 #define MAIR(attr, mt)	((attr) << ((mt) * 8))
 
 /*
- *	cpu_cache_off()
- *
- *	Turn the CPU D-cache off.
- */
-ENTRY(cpu_cache_off)
-	mrs	x0, sctlr_el1
-	bic	x0, x0, #1 << 2			// clear SCTLR.C
-	msr	sctlr_el1, x0
-	isb
-	ret
-ENDPROC(cpu_cache_off)
-
-/*
- *	cpu_reset(loc)
- *
- *	Perform a soft reset of the system.  Put the CPU into the same state
- *	as it would be if it had been reset, and branch to what would be the
- *	reset vector. It must be executed with the flat identity mapping.
- *
- *	- loc   - location to jump to for soft reset
- */
-	.align	5
-ENTRY(cpu_reset)
-	mrs	x1, sctlr_el1
-	bic	x1, x1, #1
-	msr	sctlr_el1, x1			// disable the MMU
-	isb
-	ret	x0
-ENDPROC(cpu_reset)
-
-ENTRY(cpu_soft_restart)
-	/* Save address of cpu_reset() and reset address */
-	mov	x19, x0
-	mov	x20, x1
-
-	/* Turn D-cache off */
-	bl	cpu_cache_off
-
-	/* Push out all dirty data, and ensure cache is empty */
-	bl	flush_cache_all
-
-	mov	x0, x20
-	ret	x19
-ENDPROC(cpu_soft_restart)
-
-/*
  *	cpu_do_idle()
  *
  *	Idle the processor (wait for interrupt).
diff --git a/arch/avr32/include/asm/cmpxchg.h b/arch/avr32/include/asm/cmpxchg.h
index 962a6aeab787..366bbeaeb405 100644
--- a/arch/avr32/include/asm/cmpxchg.h
+++ b/arch/avr32/include/asm/cmpxchg.h
@@ -70,8 +70,6 @@ extern unsigned long __cmpxchg_u64_unsupported_on_32bit_kernels(
    if something tries to do an invalid cmpxchg().  */
 extern void __cmpxchg_called_with_bad_pointer(void);
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 				      unsigned long new, int size)
 {
diff --git a/arch/avr32/include/asm/io.h b/arch/avr32/include/asm/io.h
index 4f5ec2bb7172..e998ff5d8e1a 100644
--- a/arch/avr32/include/asm/io.h
+++ b/arch/avr32/include/asm/io.h
@@ -296,6 +296,7 @@ extern void __iounmap(void __iomem *addr);
 	__iounmap(addr)
 
 #define ioremap_wc ioremap_nocache
+#define ioremap_wt ioremap_nocache
 
 #define cached(addr) P1SEGADDR(addr)
 #define uncached(addr) P2SEGADDR(addr)
diff --git a/arch/avr32/include/asm/uaccess.h b/arch/avr32/include/asm/uaccess.h
index a46f7cf3e1ea..68cf638faf48 100644
--- a/arch/avr32/include/asm/uaccess.h
+++ b/arch/avr32/include/asm/uaccess.h
@@ -97,7 +97,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -116,7 +117,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -136,7 +138,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -158,7 +161,8 @@ static inline __kernel_size_t __copy_from_user(void *to,
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c
index d223a8b57c1e..c03533937a9f 100644
--- a/arch/avr32/mm/fault.c
+++ b/arch/avr32/mm/fault.c
@@ -14,11 +14,11 @@
 #include <linux/pagemap.h>
 #include <linux/kdebug.h>
 #include <linux/kprobes.h>
+#include <linux/uaccess.h>
 
 #include <asm/mmu_context.h>
 #include <asm/sysreg.h>
 #include <asm/tlb.h>
-#include <asm/uaccess.h>
 
 #ifdef CONFIG_KPROBES
 static inline int notify_page_fault(struct pt_regs *regs, int trap)
@@ -81,7 +81,7 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user context, we must
 	 * not take the fault...
 	 */
-	if (in_atomic() || !mm || regs->sr & SYSREG_BIT(GM))
+	if (faulthandler_disabled() || !mm || regs->sr & SYSREG_BIT(GM))
 		goto no_context;
 
 	local_irq_enable();
diff --git a/arch/blackfin/include/asm/io.h b/arch/blackfin/include/asm/io.h
index 4e8ad0523118..6abebe82d4e9 100644
--- a/arch/blackfin/include/asm/io.h
+++ b/arch/blackfin/include/asm/io.h
@@ -10,6 +10,7 @@
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <asm/byteorder.h>
+#include <asm/def_LPBlackfin.h>
 
 #define __raw_readb bfin_read8
 #define __raw_readw bfin_read16
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
index 83f12f2ed9e3..3066d40a6db1 100644
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -8,7 +8,7 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/wait.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <arch/system.h>
 
 extern int find_fixup_code(struct pt_regs *);
@@ -109,11 +109,11 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
 	info.si_code = SEGV_MAPERR;
 
 	/*
-	 * If we're in an interrupt or "atomic" operation or have no
+	 * If we're in an interrupt, have pagefaults disabled or have no
 	 * user context, we must not take the fault.
 	 */
 
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/frv/include/asm/io.h b/arch/frv/include/asm/io.h
index 0b78bc89e840..a31b63ec4930 100644
--- a/arch/frv/include/asm/io.h
+++ b/arch/frv/include/asm/io.h
@@ -17,6 +17,8 @@
 
 #ifdef __KERNEL__
 
+#define ARCH_HAS_IOREMAP_WT
+
 #include <linux/types.h>
 #include <asm/virtconvert.h>
 #include <asm/string.h>
@@ -265,7 +267,7 @@ static inline void __iomem *ioremap_nocache(unsigned long physaddr, unsigned lon
 	return __ioremap(physaddr, size, IOMAP_NOCACHE_SER);
 }
 
-static inline void __iomem *ioremap_writethrough(unsigned long physaddr, unsigned long size)
+static inline void __iomem *ioremap_wt(unsigned long physaddr, unsigned long size)
 {
 	return __ioremap(physaddr, size, IOMAP_WRITETHROUGH);
 }
diff --git a/arch/frv/include/asm/pci.h b/arch/frv/include/asm/pci.h
index 2035a4d3f9b9..a6d4ed042c70 100644
--- a/arch/frv/include/asm/pci.h
+++ b/arch/frv/include/asm/pci.h
@@ -41,16 +41,6 @@ extern void pci_free_consistent(struct pci_dev *hwdev, size_t size,
 /* Return the index of the PCI controller for device PDEV. */
 #define pci_controller_num(PDEV)	(0)
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
-#endif
-
 /*
  *	These are pretty much arbitrary with the CoMEM implementation.
  *	We have the whole address space to ourselves.
diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c
index ec4917ddf678..61d99767fe16 100644
--- a/arch/frv/mm/fault.c
+++ b/arch/frv/mm/fault.c
@@ -19,9 +19,9 @@
 #include <linux/kernel.h>
 #include <linux/ptrace.h>
 #include <linux/hardirq.h>
+#include <linux/uaccess.h>
 
 #include <asm/pgtable.h>
-#include <asm/uaccess.h>
 #include <asm/gdb-stub.h>
 
 /*****************************************************************************/
@@ -78,7 +78,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(__frame))
diff --git a/arch/frv/mm/highmem.c b/arch/frv/mm/highmem.c
index bed9a9bd3c10..785344bbdc07 100644
--- a/arch/frv/mm/highmem.c
+++ b/arch/frv/mm/highmem.c
@@ -42,6 +42,7 @@ void *kmap_atomic(struct page *page)
 	unsigned long paddr;
 	int type;
 
+	preempt_disable();
 	pagefault_disable();
 	type = kmap_atomic_idx_push();
 	paddr = page_to_phys(page);
@@ -85,5 +86,6 @@ void __kunmap_atomic(void *kvaddr)
 	}
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/hexagon/include/asm/cmpxchg.h b/arch/hexagon/include/asm/cmpxchg.h
index 9e7802911a57..a6e34e2acbba 100644
--- a/arch/hexagon/include/asm/cmpxchg.h
+++ b/arch/hexagon/include/asm/cmpxchg.h
@@ -64,7 +64,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
  *  looks just like atomic_cmpxchg on our arch currently with a bunch of
  *  variable casting.
  */
-#define __HAVE_ARCH_CMPXCHG 1
 
 #define cmpxchg(ptr, old, new)					\
 ({								\
diff --git a/arch/hexagon/include/asm/uaccess.h b/arch/hexagon/include/asm/uaccess.h
index e4127e4d6a5b..f000a382bc7f 100644
--- a/arch/hexagon/include/asm/uaccess.h
+++ b/arch/hexagon/include/asm/uaccess.h
@@ -36,7 +36,8 @@
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Checks if a pointer to a block of memory in user space is valid.
  *
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 76d25b2cfbbe..42a91a7aa2b0 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -137,29 +137,6 @@ config AUDIT_ARCH
 	bool
 	default y
 
-menuconfig PARAVIRT_GUEST
-	bool "Paravirtualized guest support"
-	depends on BROKEN
-	help
-	  Say Y here to get to see options related to running Linux under
-	  various hypervisors.  This option alone does not add any kernel code.
-
-	  If you say N, all options in this submenu will be skipped and disabled.
-
-if PARAVIRT_GUEST
-
-config PARAVIRT
-	bool "Enable paravirtualization code"
-	depends on PARAVIRT_GUEST
-	default y
-	help
-	  This changes the kernel so it can modify itself when it is run
-	  under a hypervisor, potentially improving performance significantly
-	  over full virtualization.  However, when run without a hypervisor
-	  the kernel is theoretically slower and slightly larger.
-
-endif
-
 choice
 	prompt "System type"
 	default IA64_GENERIC
diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h
index f6769eb2bbf9..843ba435e43b 100644
--- a/arch/ia64/include/asm/barrier.h
+++ b/arch/ia64/include/asm/barrier.h
@@ -77,12 +77,7 @@ do {									\
 	___p1;								\
 })
 
-/*
- * XXX check on this ---I suspect what Linus really wants here is
- * acquire vs release semantics but we can't discuss this stuff with
- * Linus just yet.  Grrr...
- */
-#define set_mb(var, value)	do { (var) = (value); mb(); } while (0)
+#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); mb(); } while (0)
 
 /*
  * The group barrier in front of the rsm & ssm are necessary to ensure
diff --git a/arch/ia64/include/asm/hw_irq.h b/arch/ia64/include/asm/hw_irq.h
index 668786e84af8..74347ebf7d68 100644
--- a/arch/ia64/include/asm/hw_irq.h
+++ b/arch/ia64/include/asm/hw_irq.h
@@ -15,11 +15,7 @@
 #include <asm/ptrace.h>
 #include <asm/smp.h>
 
-#ifndef CONFIG_PARAVIRT
 typedef u8 ia64_vector;
-#else
-typedef u16 ia64_vector;
-#endif
 
 /*
  * 0 special
@@ -114,15 +110,11 @@ DECLARE_PER_CPU(int[IA64_NUM_VECTORS], vector_irq);
 
 extern struct irq_chip irq_type_ia64_lsapic;	/* CPU-internal interrupt controller */
 
-#ifdef CONFIG_PARAVIRT_GUEST
-#include <asm/paravirt.h>
-#else
 #define ia64_register_ipi	ia64_native_register_ipi
 #define assign_irq_vector	ia64_native_assign_irq_vector
 #define free_irq_vector		ia64_native_free_irq_vector
 #define register_percpu_irq	ia64_native_register_percpu_irq
 #define ia64_resend_irq		ia64_native_resend_irq
-#endif
 
 extern void ia64_native_register_ipi(void);
 extern int bind_irq_vector(int irq, int vector, cpumask_t domain);
diff --git a/arch/ia64/include/asm/intrinsics.h b/arch/ia64/include/asm/intrinsics.h
index 20477ea111ba..ec970a920132 100644
--- a/arch/ia64/include/asm/intrinsics.h
+++ b/arch/ia64/include/asm/intrinsics.h
@@ -7,19 +7,6 @@
 #ifndef _ASM_IA64_INTRINSICS_H
 #define _ASM_IA64_INTRINSICS_H
 
-#include <asm/paravirt_privop.h>
 #include <uapi/asm/intrinsics.h>
 
-#ifndef __ASSEMBLY__
-#if defined(CONFIG_PARAVIRT)
-# undef IA64_INTRINSIC_API
-# undef IA64_INTRINSIC_MACRO
-# ifdef ASM_SUPPORTED
-#  define IA64_INTRINSIC_API(name)	paravirt_ ## name
-# else
-#  define IA64_INTRINSIC_API(name)	pv_cpu_ops.name
-# endif
-#define IA64_INTRINSIC_MACRO(name)	paravirt_ ## name
-#endif
-#endif /* !__ASSEMBLY__ */
 #endif /* _ASM_IA64_INTRINSICS_H */
diff --git a/arch/ia64/include/asm/iosapic.h b/arch/ia64/include/asm/iosapic.h
index 94c89a2d97fe..4ae1fbd7f10e 100644
--- a/arch/ia64/include/asm/iosapic.h
+++ b/arch/ia64/include/asm/iosapic.h
@@ -55,14 +55,10 @@
 
 #define NR_IOSAPICS			256
 
-#ifdef CONFIG_PARAVIRT_GUEST
-#include <asm/paravirt.h>
-#else
 #define iosapic_pcat_compat_init	ia64_native_iosapic_pcat_compat_init
 #define __iosapic_read			__ia64_native_iosapic_read
 #define __iosapic_write			__ia64_native_iosapic_write
 #define iosapic_get_irq_chip		ia64_native_iosapic_get_irq_chip
-#endif
 
 extern void __init ia64_native_iosapic_pcat_compat_init(void);
 extern struct irq_chip *ia64_native_iosapic_get_irq_chip(unsigned long trigger);
diff --git a/arch/ia64/include/asm/irq_remapping.h b/arch/ia64/include/asm/irq_remapping.h
index e3b3556e2e1b..a8687b1d8906 100644
--- a/arch/ia64/include/asm/irq_remapping.h
+++ b/arch/ia64/include/asm/irq_remapping.h
@@ -1,6 +1,4 @@
 #ifndef __IA64_INTR_REMAPPING_H
 #define __IA64_INTR_REMAPPING_H
 #define irq_remapping_enabled 0
-#define dmar_alloc_hwirq	create_irq
-#define dmar_free_hwirq		destroy_irq
 #endif
diff --git a/arch/ia64/include/asm/module.h b/arch/ia64/include/asm/module.h
index dfba22a872c3..f31894b2a354 100644
--- a/arch/ia64/include/asm/module.h
+++ b/arch/ia64/include/asm/module.h
@@ -18,12 +18,6 @@ struct mod_arch_specific {
 	struct elf64_shdr *got;		/* global offset table */
 	struct elf64_shdr *opd;		/* official procedure descriptors */
 	struct elf64_shdr *unwind;	/* unwind-table section */
-#ifdef CONFIG_PARAVIRT
-	struct elf64_shdr *paravirt_bundles;
-					/* paravirt_alt_bundle_patch table */
-	struct elf64_shdr *paravirt_insts;
-					/* paravirt_alt_inst_patch table */
-#endif
 	unsigned long gp;		/* global-pointer for module */
 
 	void *core_unw_table;		/* core unwind-table cookie returned by unwinder */
diff --git a/arch/ia64/include/asm/native/inst.h b/arch/ia64/include/asm/native/inst.h
index d2d46efb3e6e..7e08f17accd5 100644
--- a/arch/ia64/include/asm/native/inst.h
+++ b/arch/ia64/include/asm/native/inst.h
@@ -22,32 +22,6 @@
 
 #define DO_SAVE_MIN		IA64_NATIVE_DO_SAVE_MIN
 
-#define __paravirt_switch_to			ia64_native_switch_to
-#define __paravirt_leave_syscall		ia64_native_leave_syscall
-#define __paravirt_work_processed_syscall	ia64_native_work_processed_syscall
-#define __paravirt_leave_kernel			ia64_native_leave_kernel
-#define __paravirt_pending_syscall_end		ia64_work_pending_syscall_end
-#define __paravirt_work_processed_syscall_target \
-						ia64_work_processed_syscall
-
-#define paravirt_fsyscall_table			ia64_native_fsyscall_table
-#define paravirt_fsys_bubble_down		ia64_native_fsys_bubble_down
-
-#ifdef CONFIG_PARAVIRT_GUEST_ASM_CLOBBER_CHECK
-# define PARAVIRT_POISON	0xdeadbeefbaadf00d
-# define CLOBBER(clob)				\
-	;;					\
-	movl clob = PARAVIRT_POISON;		\
-	;;
-# define CLOBBER_PRED(pred_clob)		\
-	;;					\
-	cmp.eq pred_clob, p0 = r0, r0		\
-	;;
-#else
-# define CLOBBER(clob)			/* nothing */
-# define CLOBBER_PRED(pred_clob)	/* nothing */
-#endif
-
 #define MOV_FROM_IFA(reg)	\
 	mov reg = cr.ifa
 
@@ -70,106 +44,76 @@
 	mov reg = cr.iip
 
 #define MOV_FROM_IVR(reg, clob)	\
-	mov reg = cr.ivr	\
-	CLOBBER(clob)
+	mov reg = cr.ivr
 
 #define MOV_FROM_PSR(pred, reg, clob)	\
-(pred)	mov reg = psr			\
-	CLOBBER(clob)
+(pred)	mov reg = psr
 
 #define MOV_FROM_ITC(pred, pred_clob, reg, clob)	\
-(pred)	mov reg = ar.itc				\
-	CLOBBER(clob)					\
-	CLOBBER_PRED(pred_clob)
+(pred)	mov reg = ar.itc
 
 #define MOV_TO_IFA(reg, clob)	\
-	mov cr.ifa = reg	\
-	CLOBBER(clob)
+	mov cr.ifa = reg
 
 #define MOV_TO_ITIR(pred, reg, clob)	\
-(pred)	mov cr.itir = reg		\
-	CLOBBER(clob)
+(pred)	mov cr.itir = reg
 
 #define MOV_TO_IHA(pred, reg, clob)	\
-(pred)	mov cr.iha = reg		\
-	CLOBBER(clob)
+(pred)	mov cr.iha = reg
 
 #define MOV_TO_IPSR(pred, reg, clob)		\
-(pred)	mov cr.ipsr = reg			\
-	CLOBBER(clob)
+(pred)	mov cr.ipsr = reg
 
 #define MOV_TO_IFS(pred, reg, clob)	\
-(pred)	mov cr.ifs = reg		\
-	CLOBBER(clob)
+(pred)	mov cr.ifs = reg
 
 #define MOV_TO_IIP(reg, clob)	\
-	mov cr.iip = reg	\
-	CLOBBER(clob)
+	mov cr.iip = reg
 
 #define MOV_TO_KR(kr, reg, clob0, clob1)	\
-	mov IA64_KR(kr) = reg			\
-	CLOBBER(clob0)				\
-	CLOBBER(clob1)
+	mov IA64_KR(kr) = reg
 
 #define ITC_I(pred, reg, clob)	\
-(pred)	itc.i reg		\
-	CLOBBER(clob)
+(pred)	itc.i reg
 
 #define ITC_D(pred, reg, clob)	\
-(pred)	itc.d reg		\
-	CLOBBER(clob)
+(pred)	itc.d reg
 
 #define ITC_I_AND_D(pred_i, pred_d, reg, clob)	\
 (pred_i) itc.i reg;				\
-(pred_d) itc.d reg				\
-	CLOBBER(clob)
+(pred_d) itc.d reg
 
 #define THASH(pred, reg0, reg1, clob)		\
-(pred)	thash reg0 = reg1			\
-	CLOBBER(clob)
+(pred)	thash reg0 = reg1
 
 #define SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(clob0, clob1)		\
 	ssm psr.ic | PSR_DEFAULT_BITS					\
-	CLOBBER(clob0)							\
-	CLOBBER(clob1)							\
 	;;								\
 	srlz.i /* guarantee that interruption collectin is on */	\
 	;;
 
 #define SSM_PSR_IC_AND_SRLZ_D(clob0, clob1)	\
 	ssm psr.ic				\
-	CLOBBER(clob0)				\
-	CLOBBER(clob1)				\
 	;;					\
 	srlz.d
 
 #define RSM_PSR_IC(clob)	\
-	rsm psr.ic		\
-	CLOBBER(clob)
+	rsm psr.ic
 
 #define SSM_PSR_I(pred, pred_clob, clob)	\
-(pred)	ssm psr.i				\
-	CLOBBER(clob)				\
-	CLOBBER_PRED(pred_clob)
+(pred)	ssm psr.i
 
 #define RSM_PSR_I(pred, clob0, clob1)	\
-(pred)	rsm psr.i			\
-	CLOBBER(clob0)			\
-	CLOBBER(clob1)
+(pred)	rsm psr.i
 
 #define RSM_PSR_I_IC(clob0, clob1, clob2)	\
-	rsm psr.i | psr.ic			\
-	CLOBBER(clob0)				\
-	CLOBBER(clob1)				\
-	CLOBBER(clob2)
+	rsm psr.i | psr.ic
 
 #define RSM_PSR_DT		\
 	rsm psr.dt
 
 #define RSM_PSR_BE_I(clob0, clob1)	\
-	rsm psr.be | psr.i		\
-	CLOBBER(clob0)			\
-	CLOBBER(clob1)
+	rsm psr.be | psr.i
 
 #define SSM_PSR_DT_AND_SRLZ_I	\
 	ssm psr.dt		\
@@ -177,15 +121,10 @@
 	srlz.i
 
 #define BSW_0(clob0, clob1, clob2)	\
-	bsw.0				\
-	CLOBBER(clob0)			\
-	CLOBBER(clob1)			\
-	CLOBBER(clob2)
+	bsw.0
 
 #define BSW_1(clob0, clob1)	\
-	bsw.1			\
-	CLOBBER(clob0)		\
-	CLOBBER(clob1)
+	bsw.1
 
 #define COVER	\
 	cover
diff --git a/arch/ia64/include/asm/native/pvchk_inst.h b/arch/ia64/include/asm/native/pvchk_inst.h
deleted file mode 100644
index 8d72962ec838..000000000000
--- a/arch/ia64/include/asm/native/pvchk_inst.h
+++ /dev/null
@@ -1,271 +0,0 @@
-#ifndef _ASM_NATIVE_PVCHK_INST_H
-#define _ASM_NATIVE_PVCHK_INST_H
-
-/******************************************************************************
- * arch/ia64/include/asm/native/pvchk_inst.h
- * Checker for paravirtualizations of privileged operations.
- *
- * Copyright (C) 2005 Hewlett-Packard Co
- *      Dan Magenheimer <dan.magenheimer@hp.com>
- *
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- */
-
-/**********************************************
- * Instructions paravirtualized for correctness
- **********************************************/
-
-/* "fc" and "thash" are privilege-sensitive instructions, meaning they
- *  may have different semantics depending on whether they are executed
- *  at PL0 vs PL!=0.  When paravirtualized, these instructions mustn't
- *  be allowed to execute directly, lest incorrect semantics result.
- */
-
-#define fc	.error "fc should not be used directly."
-#define thash	.error "thash should not be used directly."
-
-/* Note that "ttag" and "cover" are also privilege-sensitive; "ttag"
- * is not currently used (though it may be in a long-format VHPT system!)
- * and the semantics of cover only change if psr.ic is off which is very
- * rare (and currently non-existent outside of assembly code
- */
-#define ttag	.error "ttag should not be used directly."
-#define cover	.error "cover should not be used directly."
-
-/* There are also privilege-sensitive registers.  These registers are
- * readable at any privilege level but only writable at PL0.
- */
-#define cpuid	.error "cpuid should not be used directly."
-#define pmd	.error "pmd should not be used directly."
-
-/*
- * mov ar.eflag =
- * mov = ar.eflag
- */
-
-/**********************************************
- * Instructions paravirtualized for performance
- **********************************************/
-/*
- * Those instructions include '.' which can't be handled by cpp.
- * or can't be handled by cpp easily.
- * They are handled by sed instead of cpp.
- */
-
-/* for .S
- * itc.i
- * itc.d
- *
- * bsw.0
- * bsw.1
- *
- * ssm psr.ic | PSR_DEFAULT_BITS
- * ssm psr.ic
- * rsm psr.ic
- * ssm psr.i
- * rsm psr.i
- * rsm psr.i | psr.ic
- * rsm psr.dt
- * ssm psr.dt
- *
- * mov = cr.ifa
- * mov = cr.itir
- * mov = cr.isr
- * mov = cr.iha
- * mov = cr.ipsr
- * mov = cr.iim
- * mov = cr.iip
- * mov = cr.ivr
- * mov = psr
- *
- * mov cr.ifa =
- * mov cr.itir =
- * mov cr.iha =
- * mov cr.ipsr =
- * mov cr.ifs =
- * mov cr.iip =
- * mov cr.kr =
- */
-
-/* for intrinsics
- * ssm psr.i
- * rsm psr.i
- * mov = psr
- * mov = ivr
- * mov = tpr
- * mov cr.itm =
- * mov eoi =
- * mov rr[] =
- * mov = rr[]
- * mov = kr
- * mov kr =
- * ptc.ga
- */
-
-/*************************************************************
- * define paravirtualized instrcution macros as nop to ingore.
- * and check whether arguments are appropriate.
- *************************************************************/
-
-/* check whether reg is a regular register */
-.macro is_rreg_in reg
-	.ifc "\reg", "r0"
-		nop 0
-		.exitm
-	.endif
-	;;
-	mov \reg = r0
-	;;
-.endm
-#define IS_RREG_IN(reg)	is_rreg_in reg ;
-
-#define IS_RREG_OUT(reg)			\
-	;;					\
-	mov reg = r0				\
-	;;
-
-#define IS_RREG_CLOB(reg)	IS_RREG_OUT(reg)
-
-/* check whether pred is a predicate register */
-#define IS_PRED_IN(pred)			\
-	;;					\
-	(pred)	nop 0				\
-	;;
-
-#define IS_PRED_OUT(pred)			\
-	;;					\
-	cmp.eq pred, p0 = r0, r0		\
-	;;
-
-#define IS_PRED_CLOB(pred)	IS_PRED_OUT(pred)
-
-
-#define DO_SAVE_MIN(__COVER, SAVE_IFS, EXTRA, WORKAROUND)	\
-	nop 0
-#define MOV_FROM_IFA(reg)			\
-	IS_RREG_OUT(reg)
-#define MOV_FROM_ITIR(reg)			\
-	IS_RREG_OUT(reg)
-#define MOV_FROM_ISR(reg)			\
-	IS_RREG_OUT(reg)
-#define MOV_FROM_IHA(reg)			\
-	IS_RREG_OUT(reg)
-#define MOV_FROM_IPSR(pred, reg)		\
-	IS_PRED_IN(pred)			\
-	IS_RREG_OUT(reg)
-#define MOV_FROM_IIM(reg)			\
-	IS_RREG_OUT(reg)
-#define MOV_FROM_IIP(reg)			\
-	IS_RREG_OUT(reg)
-#define MOV_FROM_IVR(reg, clob)			\
-	IS_RREG_OUT(reg)			\
-	IS_RREG_CLOB(clob)
-#define MOV_FROM_PSR(pred, reg, clob)		\
-	IS_PRED_IN(pred)			\
-	IS_RREG_OUT(reg)			\
-	IS_RREG_CLOB(clob)
-#define MOV_FROM_ITC(pred, pred_clob, reg, clob)	\
-	IS_PRED_IN(pred)				\
-	IS_PRED_CLOB(pred_clob)				\
-	IS_RREG_OUT(reg)				\
-	IS_RREG_CLOB(clob)
-#define MOV_TO_IFA(reg, clob)			\
-	IS_RREG_IN(reg)				\
-	IS_RREG_CLOB(clob)
-#define MOV_TO_ITIR(pred, reg, clob)		\
-	IS_PRED_IN(pred)			\
-	IS_RREG_IN(reg)				\
-	IS_RREG_CLOB(clob)
-#define MOV_TO_IHA(pred, reg, clob)		\
-	IS_PRED_IN(pred)			\
-	IS_RREG_IN(reg)				\
-	IS_RREG_CLOB(clob)
-#define MOV_TO_IPSR(pred, reg, clob)		\
-	IS_PRED_IN(pred)			\
-	IS_RREG_IN(reg)				\
-	IS_RREG_CLOB(clob)
-#define MOV_TO_IFS(pred, reg, clob)		\
-	IS_PRED_IN(pred)			\
-	IS_RREG_IN(reg)				\
-	IS_RREG_CLOB(clob)
-#define MOV_TO_IIP(reg, clob)			\
-	IS_RREG_IN(reg)				\
-	IS_RREG_CLOB(clob)
-#define MOV_TO_KR(kr, reg, clob0, clob1)	\
-	IS_RREG_IN(reg)				\
-	IS_RREG_CLOB(clob0)			\
-	IS_RREG_CLOB(clob1)
-#define ITC_I(pred, reg, clob)			\
-	IS_PRED_IN(pred)			\
-	IS_RREG_IN(reg)				\
-	IS_RREG_CLOB(clob)
-#define ITC_D(pred, reg, clob)			\
-	IS_PRED_IN(pred)			\
-	IS_RREG_IN(reg)				\
-	IS_RREG_CLOB(clob)
-#define ITC_I_AND_D(pred_i, pred_d, reg, clob)	\
-	IS_PRED_IN(pred_i)			\
-	IS_PRED_IN(pred_d)			\
-	IS_RREG_IN(reg)				\
-	IS_RREG_CLOB(clob)
-#define THASH(pred, reg0, reg1, clob)		\
-	IS_PRED_IN(pred)			\
-	IS_RREG_OUT(reg0)			\
-	IS_RREG_IN(reg1)			\
-	IS_RREG_CLOB(clob)
-#define SSM_PSR_IC_AND_DEFAULT_BITS_AND_SRLZ_I(clob0, clob1)	\
-	IS_RREG_CLOB(clob0)					\
-	IS_RREG_CLOB(clob1)
-#define SSM_PSR_IC_AND_SRLZ_D(clob0, clob1)	\
-	IS_RREG_CLOB(clob0)			\
-	IS_RREG_CLOB(clob1)
-#define RSM_PSR_IC(clob)			\
-	IS_RREG_CLOB(clob)
-#define SSM_PSR_I(pred, pred_clob, clob)	\
-	IS_PRED_IN(pred)			\
-	IS_PRED_CLOB(pred_clob)			\
-	IS_RREG_CLOB(clob)
-#define RSM_PSR_I(pred, clob0, clob1)		\
-	IS_PRED_IN(pred)			\
-	IS_RREG_CLOB(clob0)			\
-	IS_RREG_CLOB(clob1)
-#define RSM_PSR_I_IC(clob0, clob1, clob2)	\
-	IS_RREG_CLOB(clob0)			\
-	IS_RREG_CLOB(clob1)			\
-	IS_RREG_CLOB(clob2)
-#define RSM_PSR_DT				\
-	nop 0
-#define RSM_PSR_BE_I(clob0, clob1)		\
-	IS_RREG_CLOB(clob0)			\
-	IS_RREG_CLOB(clob1)
-#define SSM_PSR_DT_AND_SRLZ_I			\
-	nop 0
-#define BSW_0(clob0, clob1, clob2)		\
-	IS_RREG_CLOB(clob0)			\
-	IS_RREG_CLOB(clob1)			\
-	IS_RREG_CLOB(clob2)
-#define BSW_1(clob0, clob1)			\
-	IS_RREG_CLOB(clob0)			\
-	IS_RREG_CLOB(clob1)
-#define COVER					\
-	nop 0
-#define RFI					\
-	br.ret.sptk.many rp /* defining nop causes dependency error */
-
-#endif /* _ASM_NATIVE_PVCHK_INST_H */
diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h
deleted file mode 100644
index b53518a98026..000000000000
--- a/arch/ia64/include/asm/paravirt.h
+++ /dev/null
@@ -1,321 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- */
-
-
-#ifndef __ASM_PARAVIRT_H
-#define __ASM_PARAVIRT_H
-
-#ifndef __ASSEMBLY__
-/******************************************************************************
- * fsys related addresses
- */
-struct pv_fsys_data {
-	unsigned long *fsyscall_table;
-	void *fsys_bubble_down;
-};
-
-extern struct pv_fsys_data pv_fsys_data;
-
-unsigned long *paravirt_get_fsyscall_table(void);
-char *paravirt_get_fsys_bubble_down(void);
-
-/******************************************************************************
- * patchlist addresses for gate page
- */
-enum pv_gate_patchlist {
-	PV_GATE_START_FSYSCALL,
-	PV_GATE_END_FSYSCALL,
-
-	PV_GATE_START_BRL_FSYS_BUBBLE_DOWN,
-	PV_GATE_END_BRL_FSYS_BUBBLE_DOWN,
-
-	PV_GATE_START_VTOP,
-	PV_GATE_END_VTOP,
-
-	PV_GATE_START_MCKINLEY_E9,
-	PV_GATE_END_MCKINLEY_E9,
-};
-
-struct pv_patchdata {
-	unsigned long start_fsyscall_patchlist;
-	unsigned long end_fsyscall_patchlist;
-	unsigned long start_brl_fsys_bubble_down_patchlist;
-	unsigned long end_brl_fsys_bubble_down_patchlist;
-	unsigned long start_vtop_patchlist;
-	unsigned long end_vtop_patchlist;
-	unsigned long start_mckinley_e9_patchlist;
-	unsigned long end_mckinley_e9_patchlist;
-
-	void *gate_section;
-};
-
-extern struct pv_patchdata pv_patchdata;
-
-unsigned long paravirt_get_gate_patchlist(enum pv_gate_patchlist type);
-void *paravirt_get_gate_section(void);
-#endif
-
-#ifdef CONFIG_PARAVIRT_GUEST
-
-#define PARAVIRT_HYPERVISOR_TYPE_DEFAULT	0
-
-#ifndef __ASSEMBLY__
-
-#include <asm/hw_irq.h>
-#include <asm/meminit.h>
-
-/******************************************************************************
- * general info
- */
-struct pv_info {
-	unsigned int kernel_rpl;
-	int paravirt_enabled;
-	const char *name;
-};
-
-extern struct pv_info pv_info;
-
-static inline int paravirt_enabled(void)
-{
-	return pv_info.paravirt_enabled;
-}
-
-static inline unsigned int get_kernel_rpl(void)
-{
-	return pv_info.kernel_rpl;
-}
-
-/******************************************************************************
- * initialization hooks.
- */
-struct rsvd_region;
-
-struct pv_init_ops {
-	void (*banner)(void);
-
-	int (*reserve_memory)(struct rsvd_region *region);
-
-	void (*arch_setup_early)(void);
-	void (*arch_setup_console)(char **cmdline_p);
-	int (*arch_setup_nomca)(void);
-
-	void (*post_smp_prepare_boot_cpu)(void);
-
-#ifdef ASM_SUPPORTED
-	unsigned long (*patch_bundle)(void *sbundle, void *ebundle,
-				      unsigned long type);
-	unsigned long (*patch_inst)(unsigned long stag, unsigned long etag,
-				    unsigned long type);
-#endif
-	void (*patch_branch)(unsigned long tag, unsigned long type);
-};
-
-extern struct pv_init_ops pv_init_ops;
-
-static inline void paravirt_banner(void)
-{
-	if (pv_init_ops.banner)
-		pv_init_ops.banner();
-}
-
-static inline int paravirt_reserve_memory(struct rsvd_region *region)
-{
-	if (pv_init_ops.reserve_memory)
-		return pv_init_ops.reserve_memory(region);
-	return 0;
-}
-
-static inline void paravirt_arch_setup_early(void)
-{
-	if (pv_init_ops.arch_setup_early)
-		pv_init_ops.arch_setup_early();
-}
-
-static inline void paravirt_arch_setup_console(char **cmdline_p)
-{
-	if (pv_init_ops.arch_setup_console)
-		pv_init_ops.arch_setup_console(cmdline_p);
-}
-
-static inline int paravirt_arch_setup_nomca(void)
-{
-	if (pv_init_ops.arch_setup_nomca)
-		return pv_init_ops.arch_setup_nomca();
-	return 0;
-}
-
-static inline void paravirt_post_smp_prepare_boot_cpu(void)
-{
-	if (pv_init_ops.post_smp_prepare_boot_cpu)
-		pv_init_ops.post_smp_prepare_boot_cpu();
-}
-
-/******************************************************************************
- * replacement of iosapic operations.
- */
-
-struct pv_iosapic_ops {
-	void (*pcat_compat_init)(void);
-
-	struct irq_chip *(*__get_irq_chip)(unsigned long trigger);
-
-	unsigned int (*__read)(char __iomem *iosapic, unsigned int reg);
-	void (*__write)(char __iomem *iosapic, unsigned int reg, u32 val);
-};
-
-extern struct pv_iosapic_ops pv_iosapic_ops;
-
-static inline void
-iosapic_pcat_compat_init(void)
-{
-	if (pv_iosapic_ops.pcat_compat_init)
-		pv_iosapic_ops.pcat_compat_init();
-}
-
-static inline struct irq_chip*
-iosapic_get_irq_chip(unsigned long trigger)
-{
-	return pv_iosapic_ops.__get_irq_chip(trigger);
-}
-
-static inline unsigned int
-__iosapic_read(char __iomem *iosapic, unsigned int reg)
-{
-	return pv_iosapic_ops.__read(iosapic, reg);
-}
-
-static inline void
-__iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
-{
-	return pv_iosapic_ops.__write(iosapic, reg, val);
-}
-
-/******************************************************************************
- * replacement of irq operations.
- */
-
-struct pv_irq_ops {
-	void (*register_ipi)(void);
-
-	int (*assign_irq_vector)(int irq);
-	void (*free_irq_vector)(int vector);
-
-	void (*register_percpu_irq)(ia64_vector vec,
-				    struct irqaction *action);
-
-	void (*resend_irq)(unsigned int vector);
-};
-
-extern struct pv_irq_ops pv_irq_ops;
-
-static inline void
-ia64_register_ipi(void)
-{
-	pv_irq_ops.register_ipi();
-}
-
-static inline int
-assign_irq_vector(int irq)
-{
-	return pv_irq_ops.assign_irq_vector(irq);
-}
-
-static inline void
-free_irq_vector(int vector)
-{
-	return pv_irq_ops.free_irq_vector(vector);
-}
-
-static inline void
-register_percpu_irq(ia64_vector vec, struct irqaction *action)
-{
-	pv_irq_ops.register_percpu_irq(vec, action);
-}
-
-static inline void
-ia64_resend_irq(unsigned int vector)
-{
-	pv_irq_ops.resend_irq(vector);
-}
-
-/******************************************************************************
- * replacement of time operations.
- */
-
-extern struct itc_jitter_data_t itc_jitter_data;
-extern volatile int time_keeper_id;
-
-struct pv_time_ops {
-	void (*init_missing_ticks_accounting)(int cpu);
-	int (*do_steal_accounting)(unsigned long *new_itm);
-
-	void (*clocksource_resume)(void);
-
-	unsigned long long (*sched_clock)(void);
-};
-
-extern struct pv_time_ops pv_time_ops;
-
-static inline void
-paravirt_init_missing_ticks_accounting(int cpu)
-{
-	if (pv_time_ops.init_missing_ticks_accounting)
-		pv_time_ops.init_missing_ticks_accounting(cpu);
-}
-
-struct static_key;
-extern struct static_key paravirt_steal_enabled;
-extern struct static_key paravirt_steal_rq_enabled;
-
-static inline int
-paravirt_do_steal_accounting(unsigned long *new_itm)
-{
-	return pv_time_ops.do_steal_accounting(new_itm);
-}
-
-static inline unsigned long long paravirt_sched_clock(void)
-{
-	return pv_time_ops.sched_clock();
-}
-
-#endif /* !__ASSEMBLY__ */
-
-#else
-/* fallback for native case */
-
-#ifndef __ASSEMBLY__
-
-#define paravirt_banner()				do { } while (0)
-#define paravirt_reserve_memory(region)			0
-
-#define paravirt_arch_setup_early()			do { } while (0)
-#define paravirt_arch_setup_console(cmdline_p)		do { } while (0)
-#define paravirt_arch_setup_nomca()			0
-#define paravirt_post_smp_prepare_boot_cpu()		do { } while (0)
-
-#define paravirt_init_missing_ticks_accounting(cpu)	do { } while (0)
-#define paravirt_do_steal_accounting(new_itm)		0
-
-#endif /* __ASSEMBLY__ */
-
-
-#endif /* CONFIG_PARAVIRT_GUEST */
-
-#endif /* __ASM_PARAVIRT_H */
diff --git a/arch/ia64/include/asm/paravirt_patch.h b/arch/ia64/include/asm/paravirt_patch.h
deleted file mode 100644
index 128ff5db6e67..000000000000
--- a/arch/ia64/include/asm/paravirt_patch.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- */
-
-#ifndef __ASM_PARAVIRT_PATCH_H
-#define __ASM_PARAVIRT_PATCH_H
-
-#ifdef __ASSEMBLY__
-
-	.section .paravirt_branches, "a"
-	.previous
-#define PARAVIRT_PATCH_SITE_BR(type)		\
-	{					\
-	[1:] ;					\
-	br.cond.sptk.many 2f ;			\
-	nop.b 0 ;				\
-	nop.b 0;; ;				\
-	} ;					\
-	2:					\
-	.xdata8 ".paravirt_branches", 1b, type
-
-#else
-
-#include <linux/stringify.h>
-#include <asm/intrinsics.h>
-
-/* for binary patch */
-struct paravirt_patch_site_bundle {
-	void		*sbundle;
-	void		*ebundle;
-	unsigned long	type;
-};
-
-/* label means the beginning of new bundle */
-#define paravirt_alt_bundle(instr, privop)				\
-	"\t998:\n"							\
-	"\t" instr "\n"							\
-	"\t999:\n"							\
-	"\t.pushsection .paravirt_bundles, \"a\"\n"			\
-	"\t.popsection\n"						\
-	"\t.xdata8 \".paravirt_bundles\", 998b, 999b, "			\
-	__stringify(privop) "\n"
-
-
-struct paravirt_patch_bundle_elem {
-	const void	*sbundle;
-	const void	*ebundle;
-	unsigned long	type;
-};
-
-
-struct paravirt_patch_site_inst {
-	unsigned long	stag;
-	unsigned long	etag;
-	unsigned long	type;
-};
-
-#define paravirt_alt_inst(instr, privop)				\
-	"\t[998:]\n"							\
-	"\t" instr "\n"							\
-	"\t[999:]\n"							\
-	"\t.pushsection .paravirt_insts, \"a\"\n"			\
-	"\t.popsection\n"						\
-	"\t.xdata8 \".paravirt_insts\", 998b, 999b, "			\
-	__stringify(privop) "\n"
-
-struct paravirt_patch_site_branch {
-	unsigned long	tag;
-	unsigned long	type;
-};
-
-struct paravirt_patch_branch_target {
-	const void	*entry;
-	unsigned long	type;
-};
-
-void
-__paravirt_patch_apply_branch(
-	unsigned long tag, unsigned long type,
-	const struct paravirt_patch_branch_target *entries,
-	unsigned int nr_entries);
-
-void
-paravirt_patch_reloc_br(unsigned long tag, const void *target);
-
-void
-paravirt_patch_reloc_brl(unsigned long tag, const void *target);
-
-
-#if defined(ASM_SUPPORTED) && defined(CONFIG_PARAVIRT)
-unsigned long
-ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type);
-
-unsigned long
-__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type,
-			      const struct paravirt_patch_bundle_elem *elems,
-			      unsigned long nelems,
-			      const struct paravirt_patch_bundle_elem **found);
-
-void
-paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start,
-			    const struct paravirt_patch_site_bundle *end);
-
-void
-paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start,
-			  const struct paravirt_patch_site_inst *end);
-
-void paravirt_patch_apply(void);
-#else
-#define paravirt_patch_apply_bundle(start, end)	do { } while (0)
-#define paravirt_patch_apply_inst(start, end)	do { } while (0)
-#define paravirt_patch_apply()			do { } while (0)
-#endif
-
-#endif /* !__ASSEMBLEY__ */
-
-#endif /* __ASM_PARAVIRT_PATCH_H */
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "linux"
- * c-basic-offset: 8
- * tab-width: 8
- * indent-tabs-mode: t
- * End:
- */
diff --git a/arch/ia64/include/asm/paravirt_privop.h b/arch/ia64/include/asm/paravirt_privop.h
deleted file mode 100644
index 8f6cb11c9fae..000000000000
--- a/arch/ia64/include/asm/paravirt_privop.h
+++ /dev/null
@@ -1,479 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- */
-
-#ifndef _ASM_IA64_PARAVIRT_PRIVOP_H
-#define _ASM_IA64_PARAVIRT_PRIVOP_H
-
-#ifdef CONFIG_PARAVIRT
-
-#ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-#include <asm/kregs.h> /* for IA64_PSR_I */
-
-/******************************************************************************
- * replacement of intrinsics operations.
- */
-
-struct pv_cpu_ops {
-	void (*fc)(void *addr);
-	unsigned long (*thash)(unsigned long addr);
-	unsigned long (*get_cpuid)(int index);
-	unsigned long (*get_pmd)(int index);
-	unsigned long (*getreg)(int reg);
-	void (*setreg)(int reg, unsigned long val);
-	void (*ptcga)(unsigned long addr, unsigned long size);
-	unsigned long (*get_rr)(unsigned long index);
-	void (*set_rr)(unsigned long index, unsigned long val);
-	void (*set_rr0_to_rr4)(unsigned long val0, unsigned long val1,
-			       unsigned long val2, unsigned long val3,
-			       unsigned long val4);
-	void (*ssm_i)(void);
-	void (*rsm_i)(void);
-	unsigned long (*get_psr_i)(void);
-	void (*intrin_local_irq_restore)(unsigned long flags);
-};
-
-extern struct pv_cpu_ops pv_cpu_ops;
-
-extern void ia64_native_setreg_func(int regnum, unsigned long val);
-extern unsigned long ia64_native_getreg_func(int regnum);
-
-/************************************************/
-/* Instructions paravirtualized for performance */
-/************************************************/
-
-#ifndef ASM_SUPPORTED
-#define paravirt_ssm_i()	pv_cpu_ops.ssm_i()
-#define paravirt_rsm_i()	pv_cpu_ops.rsm_i()
-#define __paravirt_getreg()	pv_cpu_ops.getreg()
-#endif
-
-/* mask for ia64_native_ssm/rsm() must be constant.("i" constraing).
- * static inline function doesn't satisfy it. */
-#define paravirt_ssm(mask)			\
-	do {					\
-		if ((mask) == IA64_PSR_I)	\
-			paravirt_ssm_i();	\
-		else				\
-			ia64_native_ssm(mask);	\
-	} while (0)
-
-#define paravirt_rsm(mask)			\
-	do {					\
-		if ((mask) == IA64_PSR_I)	\
-			paravirt_rsm_i();	\
-		else				\
-			ia64_native_rsm(mask);	\
-	} while (0)
-
-/* returned ip value should be the one in the caller,
- * not in __paravirt_getreg() */
-#define paravirt_getreg(reg)					\
-	({							\
-		unsigned long res;				\
-		if ((reg) == _IA64_REG_IP)			\
-			res = ia64_native_getreg(_IA64_REG_IP); \
-		else						\
-			res = __paravirt_getreg(reg);		\
-		res;						\
-	})
-
-/******************************************************************************
- * replacement of hand written assembly codes.
- */
-struct pv_cpu_asm_switch {
-	unsigned long switch_to;
-	unsigned long leave_syscall;
-	unsigned long work_processed_syscall;
-	unsigned long leave_kernel;
-};
-void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch);
-
-#endif /* __ASSEMBLY__ */
-
-#define IA64_PARAVIRT_ASM_FUNC(name)	paravirt_ ## name
-
-#else
-
-/* fallback for native case */
-#define IA64_PARAVIRT_ASM_FUNC(name)	ia64_native_ ## name
-
-#endif /* CONFIG_PARAVIRT */
-
-#if defined(CONFIG_PARAVIRT) && defined(ASM_SUPPORTED)
-#define paravirt_dv_serialize_data()	ia64_dv_serialize_data()
-#else
-#define paravirt_dv_serialize_data()	/* nothing */
-#endif
-
-/* these routines utilize privilege-sensitive or performance-sensitive
- * privileged instructions so the code must be replaced with
- * paravirtualized versions */
-#define ia64_switch_to			IA64_PARAVIRT_ASM_FUNC(switch_to)
-#define ia64_leave_syscall		IA64_PARAVIRT_ASM_FUNC(leave_syscall)
-#define ia64_work_processed_syscall	\
-	IA64_PARAVIRT_ASM_FUNC(work_processed_syscall)
-#define ia64_leave_kernel		IA64_PARAVIRT_ASM_FUNC(leave_kernel)
-
-
-#if defined(CONFIG_PARAVIRT)
-/******************************************************************************
- * binary patching infrastructure
- */
-#define PARAVIRT_PATCH_TYPE_FC				1
-#define PARAVIRT_PATCH_TYPE_THASH			2
-#define PARAVIRT_PATCH_TYPE_GET_CPUID			3
-#define PARAVIRT_PATCH_TYPE_GET_PMD			4
-#define PARAVIRT_PATCH_TYPE_PTCGA			5
-#define PARAVIRT_PATCH_TYPE_GET_RR			6
-#define PARAVIRT_PATCH_TYPE_SET_RR			7
-#define PARAVIRT_PATCH_TYPE_SET_RR0_TO_RR4		8
-#define PARAVIRT_PATCH_TYPE_SSM_I			9
-#define PARAVIRT_PATCH_TYPE_RSM_I			10
-#define PARAVIRT_PATCH_TYPE_GET_PSR_I			11
-#define PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE	12
-
-/* PARAVIRT_PATY_TYPE_[GS]ETREG + _IA64_REG_xxx */
-#define PARAVIRT_PATCH_TYPE_GETREG			0x10000000
-#define PARAVIRT_PATCH_TYPE_SETREG			0x20000000
-
-/*
- * struct task_struct* (*ia64_switch_to)(void* next_task);
- * void *ia64_leave_syscall;
- * void *ia64_work_processed_syscall
- * void *ia64_leave_kernel;
- */
-
-#define PARAVIRT_PATCH_TYPE_BR_START			0x30000000
-#define PARAVIRT_PATCH_TYPE_BR_SWITCH_TO		\
-	(PARAVIRT_PATCH_TYPE_BR_START + 0)
-#define PARAVIRT_PATCH_TYPE_BR_LEAVE_SYSCALL		\
-	(PARAVIRT_PATCH_TYPE_BR_START + 1)
-#define PARAVIRT_PATCH_TYPE_BR_WORK_PROCESSED_SYSCALL	\
-	(PARAVIRT_PATCH_TYPE_BR_START + 2)
-#define PARAVIRT_PATCH_TYPE_BR_LEAVE_KERNEL		\
-	(PARAVIRT_PATCH_TYPE_BR_START + 3)
-
-#ifdef ASM_SUPPORTED
-#include <asm/paravirt_patch.h>
-
-/*
- * pv_cpu_ops calling stub.
- * normal function call convension can't be written by gcc
- * inline assembly.
- *
- * from the caller's point of view,
- * the following registers will be clobbered.
- * r2, r3
- * r8-r15
- * r16, r17
- * b6, b7
- * p6-p15
- * ar.ccv
- *
- * from the callee's point of view ,
- * the following registers can be used.
- * r2, r3: scratch
- * r8: scratch, input argument0 and return value
- * r0-r15: scratch, input argument1-5
- * b6: return pointer
- * b7: scratch
- * p6-p15: scratch
- * ar.ccv: scratch
- *
- * other registers must not be changed. especially
- * b0: rp: preserved. gcc ignores b0 in clobbered register.
- * r16: saved gp
- */
-/* 5 bundles */
-#define __PARAVIRT_BR							\
-	";;\n"								\
-	"{ .mlx\n"							\
-	"nop 0\n"							\
-	"movl r2 = %[op_addr]\n"/* get function pointer address */	\
-	";;\n"								\
-	"}\n"								\
-	"1:\n"								\
-	"{ .mii\n"							\
-	"ld8 r2 = [r2]\n"	/* load function descriptor address */	\
-	"mov r17 = ip\n"	/* get ip to calc return address */	\
-	"mov r16 = gp\n"	/* save gp */				\
-	";;\n"								\
-	"}\n"								\
-	"{ .mii\n"							\
-	"ld8 r3 = [r2], 8\n"	/* load entry address */		\
-	"adds r17 =  1f - 1b, r17\n"	/* calculate return address */	\
-	";;\n"								\
-	"mov b7 = r3\n"		/* set entry address */			\
-	"}\n"								\
-	"{ .mib\n"							\
-	"ld8 gp = [r2]\n"	/* load gp value */			\
-	"mov b6 = r17\n"	/* set return address */		\
-	"br.cond.sptk.few b7\n"	/* intrinsics are very short isns */	\
-	"}\n"								\
-	"1:\n"								\
-	"{ .mii\n"							\
-	"mov gp = r16\n"	/* restore gp value */			\
-	"nop 0\n"							\
-	"nop 0\n"							\
-	";;\n"								\
-	"}\n"
-
-#define PARAVIRT_OP(op)				\
-	[op_addr] "i"(&pv_cpu_ops.op)
-
-#define PARAVIRT_TYPE(type)			\
-	PARAVIRT_PATCH_TYPE_ ## type
-
-#define PARAVIRT_REG_CLOBBERS0					\
-	"r2", "r3", /*"r8",*/ "r9", "r10", "r11", "r14",	\
-		"r15", "r16", "r17"
-
-#define PARAVIRT_REG_CLOBBERS1					\
-	"r2","r3", /*"r8",*/ "r9", "r10", "r11", "r14",		\
-		"r15", "r16", "r17"
-
-#define PARAVIRT_REG_CLOBBERS2					\
-	"r2", "r3", /*"r8", "r9",*/ "r10", "r11", "r14",	\
-		"r15", "r16", "r17"
-
-#define PARAVIRT_REG_CLOBBERS5					\
-	"r2", "r3", /*"r8", "r9", "r10", "r11", "r14",*/	\
-		"r15", "r16", "r17"
-
-#define PARAVIRT_BR_CLOBBERS			\
-	"b6", "b7"
-
-#define PARAVIRT_PR_CLOBBERS						\
-	"p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"
-
-#define PARAVIRT_AR_CLOBBERS			\
-	"ar.ccv"
-
-#define PARAVIRT_CLOBBERS0			\
-		PARAVIRT_REG_CLOBBERS0,		\
-		PARAVIRT_BR_CLOBBERS,		\
-		PARAVIRT_PR_CLOBBERS,		\
-		PARAVIRT_AR_CLOBBERS,		\
-		"memory"
-
-#define PARAVIRT_CLOBBERS1			\
-		PARAVIRT_REG_CLOBBERS1,		\
-		PARAVIRT_BR_CLOBBERS,		\
-		PARAVIRT_PR_CLOBBERS,		\
-		PARAVIRT_AR_CLOBBERS,		\
-		"memory"
-
-#define PARAVIRT_CLOBBERS2			\
-		PARAVIRT_REG_CLOBBERS2,		\
-		PARAVIRT_BR_CLOBBERS,		\
-		PARAVIRT_PR_CLOBBERS,		\
-		PARAVIRT_AR_CLOBBERS,		\
-		"memory"
-
-#define PARAVIRT_CLOBBERS5			\
-		PARAVIRT_REG_CLOBBERS5,		\
-		PARAVIRT_BR_CLOBBERS,		\
-		PARAVIRT_PR_CLOBBERS,		\
-		PARAVIRT_AR_CLOBBERS,		\
-		"memory"
-
-#define PARAVIRT_BR0(op, type)					\
-	register unsigned long ia64_clobber asm ("r8");		\
-	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
-					  PARAVIRT_TYPE(type))	\
-		      :	"=r"(ia64_clobber)			\
-		      : PARAVIRT_OP(op)				\
-		      : PARAVIRT_CLOBBERS0)
-
-#define PARAVIRT_BR0_RET(op, type)				\
-	register unsigned long ia64_intri_res asm ("r8");	\
-	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
-					  PARAVIRT_TYPE(type))	\
-		      : "=r"(ia64_intri_res)			\
-		      : PARAVIRT_OP(op)				\
-		      : PARAVIRT_CLOBBERS0)
-
-#define PARAVIRT_BR1(op, type, arg1)				\
-	register unsigned long __##arg1 asm ("r8") = arg1;	\
-	register unsigned long ia64_clobber asm ("r8");		\
-	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
-					  PARAVIRT_TYPE(type))	\
-		      :	"=r"(ia64_clobber)			\
-		      : PARAVIRT_OP(op), "0"(__##arg1)		\
-		      : PARAVIRT_CLOBBERS1)
-
-#define PARAVIRT_BR1_RET(op, type, arg1)			\
-	register unsigned long ia64_intri_res asm ("r8");	\
-	register unsigned long __##arg1 asm ("r8") = arg1;	\
-	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
-					  PARAVIRT_TYPE(type))	\
-		      : "=r"(ia64_intri_res)			\
-		      : PARAVIRT_OP(op), "0"(__##arg1)		\
-		      : PARAVIRT_CLOBBERS1)
-
-#define PARAVIRT_BR1_VOID(op, type, arg1)			\
-	register void *__##arg1 asm ("r8") = arg1;		\
-	register unsigned long ia64_clobber asm ("r8");		\
-	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
-					  PARAVIRT_TYPE(type))	\
-		      :	"=r"(ia64_clobber)			\
-		      : PARAVIRT_OP(op), "0"(__##arg1)		\
-		      : PARAVIRT_CLOBBERS1)
-
-#define PARAVIRT_BR2(op, type, arg1, arg2)				\
-	register unsigned long __##arg1 asm ("r8") = arg1;		\
-	register unsigned long __##arg2 asm ("r9") = arg2;		\
-	register unsigned long ia64_clobber1 asm ("r8");		\
-	register unsigned long ia64_clobber2 asm ("r9");		\
-	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,		\
-					  PARAVIRT_TYPE(type))		\
-		      : "=r"(ia64_clobber1), "=r"(ia64_clobber2)	\
-		      : PARAVIRT_OP(op), "0"(__##arg1), "1"(__##arg2)	\
-		      : PARAVIRT_CLOBBERS2)
-
-
-#define PARAVIRT_DEFINE_CPU_OP0(op, type)		\
-	static inline void				\
-	paravirt_ ## op (void)				\
-	{						\
-		PARAVIRT_BR0(op, type);			\
-	}
-
-#define PARAVIRT_DEFINE_CPU_OP0_RET(op, type)		\
-	static inline unsigned long			\
-	paravirt_ ## op (void)				\
-	{						\
-		PARAVIRT_BR0_RET(op, type);		\
-		return ia64_intri_res;			\
-	}
-
-#define PARAVIRT_DEFINE_CPU_OP1_VOID(op, type)		\
-	static inline void				\
-	paravirt_ ## op (void *arg1)			\
-	{						\
-		PARAVIRT_BR1_VOID(op, type, arg1);	\
-	}
-
-#define PARAVIRT_DEFINE_CPU_OP1(op, type)		\
-	static inline void				\
-	paravirt_ ## op (unsigned long arg1)		\
-	{						\
-		PARAVIRT_BR1(op, type, arg1);		\
-	}
-
-#define PARAVIRT_DEFINE_CPU_OP1_RET(op, type)		\
-	static inline unsigned long			\
-	paravirt_ ## op (unsigned long arg1)		\
-	{						\
-		PARAVIRT_BR1_RET(op, type, arg1);	\
-		return ia64_intri_res;			\
-	}
-
-#define PARAVIRT_DEFINE_CPU_OP2(op, type)		\
-	static inline void				\
-	paravirt_ ## op (unsigned long arg1,		\
-			 unsigned long arg2)		\
-	{						\
-		PARAVIRT_BR2(op, type, arg1, arg2);	\
-	}
-
-
-PARAVIRT_DEFINE_CPU_OP1_VOID(fc, FC);
-PARAVIRT_DEFINE_CPU_OP1_RET(thash, THASH)
-PARAVIRT_DEFINE_CPU_OP1_RET(get_cpuid, GET_CPUID)
-PARAVIRT_DEFINE_CPU_OP1_RET(get_pmd, GET_PMD)
-PARAVIRT_DEFINE_CPU_OP2(ptcga, PTCGA)
-PARAVIRT_DEFINE_CPU_OP1_RET(get_rr, GET_RR)
-PARAVIRT_DEFINE_CPU_OP2(set_rr, SET_RR)
-PARAVIRT_DEFINE_CPU_OP0(ssm_i, SSM_I)
-PARAVIRT_DEFINE_CPU_OP0(rsm_i, RSM_I)
-PARAVIRT_DEFINE_CPU_OP0_RET(get_psr_i, GET_PSR_I)
-PARAVIRT_DEFINE_CPU_OP1(intrin_local_irq_restore, INTRIN_LOCAL_IRQ_RESTORE)
-
-static inline void
-paravirt_set_rr0_to_rr4(unsigned long val0, unsigned long val1,
-			unsigned long val2, unsigned long val3,
-			unsigned long val4)
-{
-	register unsigned long __val0 asm ("r8") = val0;
-	register unsigned long __val1 asm ("r9") = val1;
-	register unsigned long __val2 asm ("r10") = val2;
-	register unsigned long __val3 asm ("r11") = val3;
-	register unsigned long __val4 asm ("r14") = val4;
-
-	register unsigned long ia64_clobber0 asm ("r8");
-	register unsigned long ia64_clobber1 asm ("r9");
-	register unsigned long ia64_clobber2 asm ("r10");
-	register unsigned long ia64_clobber3 asm ("r11");
-	register unsigned long ia64_clobber4 asm ("r14");
-
-	asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,
-					  PARAVIRT_TYPE(SET_RR0_TO_RR4))
-		      : "=r"(ia64_clobber0),
-			"=r"(ia64_clobber1),
-			"=r"(ia64_clobber2),
-			"=r"(ia64_clobber3),
-			"=r"(ia64_clobber4)
-		      : PARAVIRT_OP(set_rr0_to_rr4),
-			"0"(__val0), "1"(__val1), "2"(__val2),
-			"3"(__val3), "4"(__val4)
-		      : PARAVIRT_CLOBBERS5);
-}
-
-/* unsigned long paravirt_getreg(int reg) */
-#define __paravirt_getreg(reg)						\
-	({								\
-		register unsigned long ia64_intri_res asm ("r8");	\
-		register unsigned long __reg asm ("r8") = (reg);	\
-									\
-		asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
-						  PARAVIRT_TYPE(GETREG) \
-						  + (reg))		\
-			      : "=r"(ia64_intri_res)			\
-			      : PARAVIRT_OP(getreg), "0"(__reg)		\
-			      : PARAVIRT_CLOBBERS1);			\
-									\
-		ia64_intri_res;						\
-	})
-
-/* void paravirt_setreg(int reg, unsigned long val) */
-#define paravirt_setreg(reg, val)					\
-	do {								\
-		register unsigned long __val asm ("r8") = val;		\
-		register unsigned long __reg asm ("r9") = reg;		\
-		register unsigned long ia64_clobber1 asm ("r8");	\
-		register unsigned long ia64_clobber2 asm ("r9");	\
-									\
-		asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,	\
-						  PARAVIRT_TYPE(SETREG) \
-						  + (reg))		\
-			      : "=r"(ia64_clobber1),			\
-				"=r"(ia64_clobber2)			\
-			      : PARAVIRT_OP(setreg),			\
-				"1"(__reg), "0"(__val)			\
-			      : PARAVIRT_CLOBBERS2);			\
-	} while (0)
-
-#endif /* ASM_SUPPORTED */
-#endif /* CONFIG_PARAVIRT && ASM_SUPPOTED */
-
-#endif /* _ASM_IA64_PARAVIRT_PRIVOP_H */
diff --git a/arch/ia64/include/asm/pci.h b/arch/ia64/include/asm/pci.h
index 52af5ed9f60b..b897fae1f0ca 100644
--- a/arch/ia64/include/asm/pci.h
+++ b/arch/ia64/include/asm/pci.h
@@ -52,25 +52,6 @@ extern unsigned long ia64_max_iommu_merge_mask;
 
 #include <asm-generic/pci-dma-compat.h>
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	unsigned long cacheline_size;
-	u8 byte;
-
-	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
-	if (byte == 0)
-		cacheline_size = 1024;
-	else
-		cacheline_size = (int) byte * 4;
-
-	*strat = PCI_DMA_BURST_MULTIPLE;
-	*strategy_parameter = cacheline_size;
-}
-#endif
-
 #define HAVE_PCI_MMAP
 extern int pci_mmap_page_range (struct pci_dev *dev, struct vm_area_struct *vma,
 				enum pci_mmap_state mmap_state, int write_combine);
@@ -108,19 +89,6 @@ static inline int pci_proc_domain(struct pci_bus *bus)
 	return (pci_domain_nr(bus) != 0);
 }
 
-static inline struct resource *
-pcibios_select_root(struct pci_dev *pdev, struct resource *res)
-{
-	struct resource *root = NULL;
-
-	if (res->flags & IORESOURCE_IO)
-		root = &ioport_resource;
-	if (res->flags & IORESOURCE_MEM)
-		root = &iomem_resource;
-
-	return root;
-}
-
 #define HAVE_ARCH_PCI_GET_LEGACY_IDE_IRQ
 static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
 {
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 6437ca21f61b..3ad8f6988363 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -53,7 +53,7 @@ void build_cpu_to_node_map(void);
 #define topology_physical_package_id(cpu)	(cpu_data(cpu)->socket_id)
 #define topology_core_id(cpu)			(cpu_data(cpu)->core_id)
 #define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
-#define topology_thread_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
 #endif
 
 extern void arch_fix_phys_package_id(int num, u32 slot);
diff --git a/arch/ia64/include/uapi/asm/cmpxchg.h b/arch/ia64/include/uapi/asm/cmpxchg.h
index f35109b1d907..a0e3620f8f13 100644
--- a/arch/ia64/include/uapi/asm/cmpxchg.h
+++ b/arch/ia64/include/uapi/asm/cmpxchg.h
@@ -61,8 +61,6 @@ extern void ia64_xchg_called_with_bad_pointer(void);
  * indicated by comparing RETURN with OLD.
  */
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 /*
  * This function doesn't exist, so you'll get a linker error
  * if something tries to do an invalid cmpxchg().
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index d68b5cf81e31..3686d6abafde 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -9,7 +9,7 @@ endif
 extra-y	:= head.o init_task.o vmlinux.lds
 
 obj-y := entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o	\
-	 irq_lsapic.o ivt.o machvec.o pal.o paravirt_patchlist.o patch.o process.o perfmon.o ptrace.o sal.o		\
+	 irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o		\
 	 salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \
 	 unwind.o mca.o mca_asm.o topology.o dma-mapping.o
 
@@ -35,9 +35,6 @@ mca_recovery-y			+= mca_drv.o mca_drv_asm.o
 obj-$(CONFIG_IA64_MC_ERR_INJECT)+= err_inject.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 
-obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirtentry.o \
-				   paravirt_patch.o
-
 obj-$(CONFIG_IA64_ESI)		+= esi.o
 ifneq ($(CONFIG_IA64_ESI),)
 obj-y				+= esi_stub.o	# must be in kernel proper
@@ -52,8 +49,6 @@ CFLAGS_traps.o  += -mfixed-range=f2-f5,f16-f31
 
 # The gate DSO image is built using a special linker script.
 include $(src)/Makefile.gate
-# tell compiled for native
-CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_NATIVE
 
 # Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config
 define sed-y
@@ -84,30 +79,3 @@ arch/$(SRCARCH)/kernel/nr-irqs.s: arch/$(SRCARCH)/kernel/nr-irqs.c
 include/generated/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s
 	$(Q)mkdir -p $(dir $@)
 	$(call cmd,nr_irqs)
-
-#
-# native ivt.S, entry.S and fsys.S
-#
-ASM_PARAVIRT_OBJS = ivt.o entry.o fsys.o
-define paravirtualized_native
-AFLAGS_$(1) += -D__IA64_ASM_PARAVIRTUALIZED_NATIVE
-AFLAGS_pvchk-sed-$(1) += -D__IA64_ASM_PARAVIRTUALIZED_PVCHECK
-extra-y += pvchk-$(1)
-endef
-$(foreach obj,$(ASM_PARAVIRT_OBJS),$(eval $(call paravirtualized_native,$(obj))))
-
-#
-# Checker for paravirtualizations of privileged operations.
-#
-quiet_cmd_pv_check_sed = PVCHK   $@
-define cmd_pv_check_sed
-	sed -f $(srctree)/arch/$(SRCARCH)/scripts/pvcheck.sed $< > $@
-endef
-
-$(obj)/pvchk-sed-%.s: $(src)/%.S $(srctree)/arch/$(SRCARCH)/scripts/pvcheck.sed FORCE
-	$(call if_changed_dep,as_s_S)
-$(obj)/pvchk-%.s: $(obj)/pvchk-sed-%.s FORCE
-	$(call if_changed,pv_check_sed)
-$(obj)/pvchk-%.o: $(obj)/pvchk-%.s FORCE
-	$(call if_changed,as_o_S)
-.PRECIOUS: $(obj)/pvchk-sed-%.s $(obj)/pvchk-%.s $(obj)/pvchk-%.o
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index c52d7540dc05..47e962f7ed5a 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -464,7 +464,6 @@ efi_map_pal_code (void)
 		 GRANULEROUNDDOWN((unsigned long) pal_vaddr),
 		 pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)),
 		 IA64_GRANULE_SHIFT);
-	paravirt_dv_serialize_data();
 	ia64_set_psr(psr);		/* restore psr */
 }
 
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index fcf8b8cbca0b..ae0de7bf5525 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -51,7 +51,6 @@
 
 #include "minstate.h"
 
-#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE
 	/*
 	 * execve() is special because in case of success, we need to
 	 * setup a null register window frame.
@@ -161,7 +160,6 @@ GLOBAL_ENTRY(sys_clone)
 	mov rp=loc0
 	br.ret.sptk.many rp
 END(sys_clone)
-#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */
 
 /*
  * prev_task <- ia64_switch_to(struct task_struct *next)
@@ -169,7 +167,7 @@ END(sys_clone)
  *	called.  The code starting at .map relies on this.  The rest of the code
  *	doesn't care about the interrupt masking status.
  */
-GLOBAL_ENTRY(__paravirt_switch_to)
+GLOBAL_ENTRY(ia64_switch_to)
 	.prologue
 	alloc r16=ar.pfs,1,0,0,0
 	DO_SAVE_SWITCH_STACK
@@ -221,9 +219,8 @@ GLOBAL_ENTRY(__paravirt_switch_to)
 	itr.d dtr[r25]=r23		// wire in new mapping...
 	SSM_PSR_IC_AND_SRLZ_D(r8, r9)	// reenable the psr.ic bit
 	br.cond.sptk .done
-END(__paravirt_switch_to)
+END(ia64_switch_to)
 
-#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE
 /*
  * Note that interrupts are enabled during save_switch_stack and load_switch_stack.  This
  * means that we may get an interrupt with "sp" pointing to the new kernel stack while
@@ -639,16 +636,8 @@ GLOBAL_ENTRY(ia64_ret_from_syscall)
 	adds r2=PT(R8)+16,sp			// r2 = &pt_regs.r8
 	mov r10=r0				// clear error indication in r10
 (p7)	br.cond.spnt handle_syscall_error	// handle potential syscall failure
-#ifdef CONFIG_PARAVIRT
-	;;
-	br.cond.sptk.few ia64_leave_syscall
-	;;
-#endif /* CONFIG_PARAVIRT */
 END(ia64_ret_from_syscall)
-#ifndef CONFIG_PARAVIRT
 	// fall through
-#endif
-#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */
 
 /*
  * ia64_leave_syscall(): Same as ia64_leave_kernel, except that it doesn't
@@ -694,7 +683,7 @@ END(ia64_ret_from_syscall)
  *	      ar.csd: cleared
  *	      ar.ssd: cleared
  */
-GLOBAL_ENTRY(__paravirt_leave_syscall)
+GLOBAL_ENTRY(ia64_leave_syscall)
 	PT_REGS_UNWIND_INFO(0)
 	/*
 	 * work.need_resched etc. mustn't get changed by this CPU before it returns to
@@ -722,8 +711,8 @@ GLOBAL_ENTRY(__paravirt_leave_syscall)
 	cmp.eq pLvSys,p0=r0,r0		// pLvSys=1: leave from syscall
 (pUStk)	cmp.eq.unc p6,p0=r0,r0		// p6 <- pUStk
 #endif
-.global __paravirt_work_processed_syscall;
-__paravirt_work_processed_syscall:
+.global ia64_work_processed_syscall;
+ia64_work_processed_syscall:
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 	adds r2=PT(LOADRS)+16,r12
 	MOV_FROM_ITC(pUStk, p9, r22, r19)	// fetch time at leave
@@ -836,9 +825,9 @@ __paravirt_work_processed_syscall:
 	mov.m ar.ssd=r0			// M2   clear ar.ssd
 	mov f11=f0			// F    clear f11
 	br.cond.sptk.many rbs_switch	// B
-END(__paravirt_leave_syscall)
+END(ia64_leave_syscall)
 
-GLOBAL_ENTRY(__paravirt_leave_kernel)
+GLOBAL_ENTRY(ia64_leave_kernel)
 	PT_REGS_UNWIND_INFO(0)
 	/*
 	 * work.need_resched etc. mustn't get changed by this CPU before it returns to
@@ -1171,26 +1160,25 @@ skip_rbs_switch:
 (p6)	br.cond.sptk.few .notify
 	br.call.spnt.many rp=preempt_schedule_irq
 .ret9:	cmp.eq p6,p0=r0,r0	// p6 <- 1 (re-check)
-(pLvSys)br.cond.sptk.few  __paravirt_pending_syscall_end
+(pLvSys)br.cond.sptk.few  ia64_work_pending_syscall_end
 	br.cond.sptk.many .work_processed_kernel
 
 .notify:
 (pUStk)	br.call.spnt.many rp=notify_resume_user
 .ret10:	cmp.ne p6,p0=r0,r0	// p6 <- 0 (don't re-check)
-(pLvSys)br.cond.sptk.few  __paravirt_pending_syscall_end
+(pLvSys)br.cond.sptk.few  ia64_work_pending_syscall_end
 	br.cond.sptk.many .work_processed_kernel
 
-.global __paravirt_pending_syscall_end;
-__paravirt_pending_syscall_end:
+.global ia64_work_pending_syscall_end;
+ia64_work_pending_syscall_end:
 	adds r2=PT(R8)+16,r12
 	adds r3=PT(R10)+16,r12
 	;;
 	ld8 r8=[r2]
 	ld8 r10=[r3]
-	br.cond.sptk.many __paravirt_work_processed_syscall_target
-END(__paravirt_leave_kernel)
+	br.cond.sptk.many ia64_work_processed_syscall
+END(ia64_leave_kernel)
 
-#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE
 ENTRY(handle_syscall_error)
 	/*
 	 * Some system calls (e.g., ptrace, mmap) can return arbitrary values which could
@@ -1294,7 +1282,7 @@ ENTRY(sys_rt_sigreturn)
 	adds sp=16,sp
 	;;
 	ld8 r9=[sp]				// load new ar.unat
-	mov.sptk b7=r8,ia64_native_leave_kernel
+	mov.sptk b7=r8,ia64_leave_kernel
 	;;
 	mov ar.unat=r9
 	br.many b7
@@ -1782,4 +1770,3 @@ sys_call_table:
 	data8 sys_execveat
 
 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
-#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
index abc6dee3799c..edbf7af95849 100644
--- a/arch/ia64/kernel/fsys.S
+++ b/arch/ia64/kernel/fsys.S
@@ -24,7 +24,7 @@
 #include <asm/unistd.h>
 
 #include "entry.h"
-#include "paravirt_inst.h"
+#include <asm/native/inst.h>
 
 /*
  * See Documentation/ia64/fsys.txt for details on fsyscalls.
@@ -402,7 +402,7 @@ ENTRY(fsys_fallback_syscall)
 	mov r26=ar.pfs
 END(fsys_fallback_syscall)
 	/* FALL THROUGH */
-GLOBAL_ENTRY(paravirt_fsys_bubble_down)
+GLOBAL_ENTRY(fsys_bubble_down)
 	.prologue
 	.altrp b6
 	.body
@@ -440,7 +440,7 @@ GLOBAL_ENTRY(paravirt_fsys_bubble_down)
 	 *
 	 * PSR.BE : already is turned off in __kernel_syscall_via_epc()
 	 * PSR.AC : don't care (kernel normally turns PSR.AC on)
-	 * PSR.I  : already turned off by the time paravirt_fsys_bubble_down gets
+	 * PSR.I  : already turned off by the time fsys_bubble_down gets
 	 *	    invoked
 	 * PSR.DFL: always 0 (kernel never turns it on)
 	 * PSR.DFH: don't care --- kernel never touches f32-f127 on its own
@@ -450,7 +450,7 @@ GLOBAL_ENTRY(paravirt_fsys_bubble_down)
 	 * PSR.DB : don't care --- kernel never enables kernel-level
 	 *	    breakpoints
 	 * PSR.TB : must be 0 already; if it wasn't zero on entry to
-	 *          __kernel_syscall_via_epc, the branch to paravirt_fsys_bubble_down
+	 *          __kernel_syscall_via_epc, the branch to fsys_bubble_down
 	 *          will trigger a taken branch; the taken-trap-handler then
 	 *          converts the syscall into a break-based system-call.
 	 */
@@ -541,14 +541,14 @@ GLOBAL_ENTRY(paravirt_fsys_bubble_down)
 	nop.m 0
 (p8)	br.call.sptk.many b6=b6			// B    (ignore return address)
 	br.cond.spnt ia64_trace_syscall		// B
-END(paravirt_fsys_bubble_down)
+END(fsys_bubble_down)
 
 	.rodata
 	.align 8
-	.globl paravirt_fsyscall_table
+	.globl fsyscall_table
 
-	data8 paravirt_fsys_bubble_down
-paravirt_fsyscall_table:
+	data8 fsys_bubble_down
+fsyscall_table:
 	data8 fsys_ni_syscall
 	data8 0				// exit			// 1025
 	data8 0				// read
@@ -833,4 +833,4 @@ paravirt_fsyscall_table:
 
 	// fill in zeros for the remaining entries
 	.zero:
-	.space paravirt_fsyscall_table + 8*NR_syscalls - .zero, 0
+	.space fsyscall_table + 8*NR_syscalls - .zero, 0
diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S
index b5f8bdd8618e..0bd1b3bfaf1c 100644
--- a/arch/ia64/kernel/gate.S
+++ b/arch/ia64/kernel/gate.S
@@ -14,7 +14,7 @@
 #include <asm/unistd.h>
 #include <asm/kregs.h>
 #include <asm/page.h>
-#include "paravirt_inst.h"
+#include <asm/native/inst.h>
 
 /*
  * We can't easily refer to symbols inside the kernel.  To avoid full runtime relocation,
@@ -376,11 +376,4 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc)
 (p9)	mov r8=ENOSYS
 	FSYS_RETURN
 
-#ifdef CONFIG_PARAVIRT
-	/*
-	 * padd to make the size of this symbol constant
-	 * independent of paravirtualization.
-	 */
-	.align PAGE_SIZE / 8
-#endif
 END(__kernel_syscall_via_epc)
diff --git a/arch/ia64/kernel/gate.lds.S b/arch/ia64/kernel/gate.lds.S
index e518f7902af6..3e8271e85a1e 100644
--- a/arch/ia64/kernel/gate.lds.S
+++ b/arch/ia64/kernel/gate.lds.S
@@ -6,7 +6,6 @@
  */
 
 #include <asm/page.h>
-#include "paravirt_patchlist.h"
 
 SECTIONS
 {
@@ -33,21 +32,21 @@ SECTIONS
 	. = GATE_ADDR + 0x600;
 
 	.data..patch		: {
-		__paravirt_start_gate_mckinley_e9_patchlist = .;
+		__start_gate_mckinley_e9_patchlist = .;
 		*(.data..patch.mckinley_e9)
-		__paravirt_end_gate_mckinley_e9_patchlist = .;
+		__end_gate_mckinley_e9_patchlist = .;
 
-		__paravirt_start_gate_vtop_patchlist = .;
+		__start_gate_vtop_patchlist = .;
 		*(.data..patch.vtop)
-		__paravirt_end_gate_vtop_patchlist = .;
+		__end_gate_vtop_patchlist = .;
 
-		__paravirt_start_gate_fsyscall_patchlist = .;
+		__start_gate_fsyscall_patchlist = .;
 		*(.data..patch.fsyscall_table)
-		__paravirt_end_gate_fsyscall_patchlist = .;
+		__end_gate_fsyscall_patchlist = .;
 
-		__paravirt_start_gate_brl_fsys_bubble_down_patchlist = .;
+		__start_gate_brl_fsys_bubble_down_patchlist = .;
 		*(.data..patch.brl_fsys_bubble_down)
-		__paravirt_end_gate_brl_fsys_bubble_down_patchlist = .;
+		__end_gate_brl_fsys_bubble_down_patchlist = .;
 	}						:readable
 
 	.IA_64.unwind_info	: { *(.IA_64.unwind_info*) }
diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
index a4acddad0c78..bb748c596443 100644
--- a/arch/ia64/kernel/head.S
+++ b/arch/ia64/kernel/head.S
@@ -26,7 +26,6 @@
 #include <asm/mmu_context.h>
 #include <asm/asm-offsets.h>
 #include <asm/pal.h>
-#include <asm/paravirt.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/ptrace.h>
@@ -394,41 +393,6 @@ start_ap:
 	;;
 (isBP)	st8 [r2]=r28		// save the address of the boot param area passed by the bootloader
 
-#ifdef CONFIG_PARAVIRT
-
-	movl r14=hypervisor_setup_hooks
-	movl r15=hypervisor_type
-	mov r16=num_hypervisor_hooks
-	;;
-	ld8 r2=[r15]
-	;;
-	cmp.ltu p7,p0=r2,r16	// array size check
-	shladd r8=r2,3,r14
-	;;
-(p7)	ld8 r9=[r8]
-	;;
-(p7)	mov b1=r9
-(p7)	cmp.ne.unc p7,p0=r9,r0	// no actual branch to NULL
-	;;
-(p7)	br.call.sptk.many rp=b1
-
-	__INITDATA
-
-default_setup_hook = 0		// Currently nothing needs to be done.
-
-	.global hypervisor_type
-hypervisor_type:
-	data8		PARAVIRT_HYPERVISOR_TYPE_DEFAULT
-
-	// must have the same order with PARAVIRT_HYPERVISOR_TYPE_xxx
-
-hypervisor_setup_hooks:
-	data8		default_setup_hook
-num_hypervisor_hooks = (. - hypervisor_setup_hooks) / 8
-	.previous
-
-#endif
-
 #ifdef CONFIG_SMP
 (isAP)	br.call.sptk.many rp=start_secondary
 .ret0:
@@ -1063,12 +1027,6 @@ GLOBAL_ENTRY(ia64_native_sched_clock)
 	shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT
 	br.ret.sptk.many rp
 END(ia64_native_sched_clock)
-#ifndef CONFIG_PARAVIRT
-	//unsigned long long
-	//sched_clock(void) __attribute__((alias("ia64_native_sched_clock")));
-	.global sched_clock
-sched_clock = ia64_native_sched_clock
-#endif
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 GLOBAL_ENTRY(cycle_to_cputime)
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
index e42bf7a913f3..b1c3cfc93e71 100644
--- a/arch/ia64/kernel/ivt.S
+++ b/arch/ia64/kernel/ivt.S
@@ -937,7 +937,6 @@ END(interrupt)
 	 *	- ar.fpsr: set to kernel settings
 	 *	-  b6: preserved (same as on entry)
 	 */
-#ifdef __IA64_ASM_PARAVIRTUALIZED_NATIVE
 GLOBAL_ENTRY(ia64_syscall_setup)
 #if PT(B6) != 0
 # error This code assumes that b6 is the first field in pt_regs.
@@ -1029,7 +1028,6 @@ GLOBAL_ENTRY(ia64_syscall_setup)
 (p10)	mov r8=-EINVAL
 	br.ret.sptk.many b7
 END(ia64_syscall_setup)
-#endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */
 
 	.org ia64_ivt+0x3c00
 /////////////////////////////////////////////////////////////////////////////////////////
@@ -1043,7 +1041,7 @@ END(ia64_syscall_setup)
 	DBG_FAULT(16)
 	FAULT(16)
 
-#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(__IA64_ASM_PARAVIRTUALIZED_NATIVE)
+#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE)
 	/*
 	 * There is no particular reason for this code to be here, other than
 	 * that there happens to be space here that would go unused otherwise.
diff --git a/arch/ia64/kernel/minstate.h b/arch/ia64/kernel/minstate.h
index cc82a7d744c9..5704700fb703 100644
--- a/arch/ia64/kernel/minstate.h
+++ b/arch/ia64/kernel/minstate.h
@@ -2,7 +2,7 @@
 #include <asm/cache.h>
 
 #include "entry.h"
-#include "paravirt_inst.h"
+#include <asm/native/inst.h>
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 /* read ar.itc in advance, and use it before leaving bank 0 */
diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c
index 29754aae5177..b15933c31b2f 100644
--- a/arch/ia64/kernel/module.c
+++ b/arch/ia64/kernel/module.c
@@ -439,14 +439,6 @@ module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings,
 			mod->arch.opd = s;
 		else if (strcmp(".IA_64.unwind", secstrings + s->sh_name) == 0)
 			mod->arch.unwind = s;
-#ifdef CONFIG_PARAVIRT
-		else if (strcmp(".paravirt_bundles",
-				secstrings + s->sh_name) == 0)
-			mod->arch.paravirt_bundles = s;
-		else if (strcmp(".paravirt_insts",
-				secstrings + s->sh_name) == 0)
-			mod->arch.paravirt_insts = s;
-#endif
 
 	if (!mod->arch.core_plt || !mod->arch.init_plt || !mod->arch.got || !mod->arch.opd) {
 		printk(KERN_ERR "%s: sections missing\n", mod->name);
@@ -914,30 +906,6 @@ module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mo
 	DEBUGP("%s: init: entry=%p\n", __func__, mod->init);
 	if (mod->arch.unwind)
 		register_unwind_table(mod);
-#ifdef CONFIG_PARAVIRT
-        if (mod->arch.paravirt_bundles) {
-                struct paravirt_patch_site_bundle *start =
-                        (struct paravirt_patch_site_bundle *)
-                        mod->arch.paravirt_bundles->sh_addr;
-                struct paravirt_patch_site_bundle *end =
-                        (struct paravirt_patch_site_bundle *)
-                        (mod->arch.paravirt_bundles->sh_addr +
-                         mod->arch.paravirt_bundles->sh_size);
-
-                paravirt_patch_apply_bundle(start, end);
-        }
-        if (mod->arch.paravirt_insts) {
-                struct paravirt_patch_site_inst *start =
-                        (struct paravirt_patch_site_inst *)
-                        mod->arch.paravirt_insts->sh_addr;
-                struct paravirt_patch_site_inst *end =
-                        (struct paravirt_patch_site_inst *)
-                        (mod->arch.paravirt_insts->sh_addr +
-                         mod->arch.paravirt_insts->sh_size);
-
-                paravirt_patch_apply_inst(start, end);
-        }
-#endif
 	return 0;
 }
 
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 9dd7464f8c17..d70bf15c690a 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -165,7 +165,7 @@ static struct irq_chip dmar_msi_type = {
 	.irq_retrigger = ia64_msi_retrigger_irq,
 };
 
-static int
+static void
 msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
 {
 	struct irq_cfg *cfg = irq_cfg + irq;
@@ -186,21 +186,29 @@ msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
 		MSI_DATA_LEVEL_ASSERT |
 		MSI_DATA_DELIVERY_FIXED |
 		MSI_DATA_VECTOR(cfg->vector);
-	return 0;
 }
 
-int arch_setup_dmar_msi(unsigned int irq)
+int dmar_alloc_hwirq(int id, int node, void *arg)
 {
-	int ret;
+	int irq;
 	struct msi_msg msg;
 
-	ret = msi_compose_msg(NULL, irq, &msg);
-	if (ret < 0)
-		return ret;
-	dmar_msi_write(irq, &msg);
-	irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
-				      "edge");
-	return 0;
+	irq = create_irq();
+	if (irq > 0) {
+		irq_set_handler_data(irq, arg);
+		irq_set_chip_and_handler_name(irq, &dmar_msi_type,
+					      handle_edge_irq, "edge");
+		msi_compose_msg(NULL, irq, &msg);
+		dmar_msi_write(irq, &msg);
+	}
+
+	return irq;
+}
+
+void dmar_free_hwirq(int irq)
+{
+	irq_set_handler_data(irq, NULL);
+	destroy_irq(irq);
 }
 #endif /* CONFIG_INTEL_IOMMU */
 
diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c
deleted file mode 100644
index 1b22f6de2932..000000000000
--- a/arch/ia64/kernel/paravirt.c
+++ /dev/null
@@ -1,902 +0,0 @@
-/******************************************************************************
- * arch/ia64/kernel/paravirt.c
- *
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- *     Yaozu (Eddie) Dong <eddie.dong@intel.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- */
-
-#include <linux/init.h>
-
-#include <linux/compiler.h>
-#include <linux/io.h>
-#include <linux/irq.h>
-#include <linux/module.h>
-#include <linux/types.h>
-
-#include <asm/iosapic.h>
-#include <asm/paravirt.h>
-
-/***************************************************************************
- * general info
- */
-struct pv_info pv_info = {
-	.kernel_rpl = 0,
-	.paravirt_enabled = 0,
-	.name = "bare hardware"
-};
-
-/***************************************************************************
- * pv_init_ops
- * initialization hooks.
- */
-
-static void __init
-ia64_native_patch_branch(unsigned long tag, unsigned long type);
-
-struct pv_init_ops pv_init_ops =
-{
-#ifdef ASM_SUPPORTED
-	.patch_bundle = ia64_native_patch_bundle,
-#endif
-	.patch_branch = ia64_native_patch_branch,
-};
-
-/***************************************************************************
- * pv_cpu_ops
- * intrinsics hooks.
- */
-
-#ifndef ASM_SUPPORTED
-/* ia64_native_xxx are macros so that we have to make them real functions */
-
-#define DEFINE_VOID_FUNC1(name)					\
-	static void						\
-	ia64_native_ ## name ## _func(unsigned long arg)	\
-	{							\
-		ia64_native_ ## name(arg);			\
-	}
-
-#define DEFINE_VOID_FUNC1_VOID(name)				\
-	static void						\
-	ia64_native_ ## name ## _func(void *arg)		\
-	{							\
-		ia64_native_ ## name(arg);			\
-	}
-
-#define DEFINE_VOID_FUNC2(name)					\
-	static void						\
-	ia64_native_ ## name ## _func(unsigned long arg0,	\
-				      unsigned long arg1)	\
-	{							\
-		ia64_native_ ## name(arg0, arg1);		\
-	}
-
-#define DEFINE_FUNC0(name)			\
-	static unsigned long			\
-	ia64_native_ ## name ## _func(void)	\
-	{					\
-		return ia64_native_ ## name();	\
-	}
-
-#define DEFINE_FUNC1(name, type)			\
-	static unsigned long				\
-	ia64_native_ ## name ## _func(type arg)		\
-	{						\
-		return ia64_native_ ## name(arg);	\
-	}						\
-
-DEFINE_VOID_FUNC1_VOID(fc);
-DEFINE_VOID_FUNC1(intrin_local_irq_restore);
-
-DEFINE_VOID_FUNC2(ptcga);
-DEFINE_VOID_FUNC2(set_rr);
-
-DEFINE_FUNC0(get_psr_i);
-
-DEFINE_FUNC1(thash, unsigned long);
-DEFINE_FUNC1(get_cpuid, int);
-DEFINE_FUNC1(get_pmd, int);
-DEFINE_FUNC1(get_rr, unsigned long);
-
-static void
-ia64_native_ssm_i_func(void)
-{
-	ia64_native_ssm(IA64_PSR_I);
-}
-
-static void
-ia64_native_rsm_i_func(void)
-{
-	ia64_native_rsm(IA64_PSR_I);
-}
-
-static void
-ia64_native_set_rr0_to_rr4_func(unsigned long val0, unsigned long val1,
-				unsigned long val2, unsigned long val3,
-				unsigned long val4)
-{
-	ia64_native_set_rr0_to_rr4(val0, val1, val2, val3, val4);
-}
-
-#define CASE_GET_REG(id)				\
-	case _IA64_REG_ ## id:				\
-	res = ia64_native_getreg(_IA64_REG_ ## id);	\
-	break;
-#define CASE_GET_AR(id) CASE_GET_REG(AR_ ## id)
-#define CASE_GET_CR(id) CASE_GET_REG(CR_ ## id)
-
-unsigned long
-ia64_native_getreg_func(int regnum)
-{
-	unsigned long res = -1;
-	switch (regnum) {
-	CASE_GET_REG(GP);
-	/*CASE_GET_REG(IP);*/ /* returned ip value shouldn't be constant */
-	CASE_GET_REG(PSR);
-	CASE_GET_REG(TP);
-	CASE_GET_REG(SP);
-
-	CASE_GET_AR(KR0);
-	CASE_GET_AR(KR1);
-	CASE_GET_AR(KR2);
-	CASE_GET_AR(KR3);
-	CASE_GET_AR(KR4);
-	CASE_GET_AR(KR5);
-	CASE_GET_AR(KR6);
-	CASE_GET_AR(KR7);
-	CASE_GET_AR(RSC);
-	CASE_GET_AR(BSP);
-	CASE_GET_AR(BSPSTORE);
-	CASE_GET_AR(RNAT);
-	CASE_GET_AR(FCR);
-	CASE_GET_AR(EFLAG);
-	CASE_GET_AR(CSD);
-	CASE_GET_AR(SSD);
-	CASE_GET_AR(CFLAG);
-	CASE_GET_AR(FSR);
-	CASE_GET_AR(FIR);
-	CASE_GET_AR(FDR);
-	CASE_GET_AR(CCV);
-	CASE_GET_AR(UNAT);
-	CASE_GET_AR(FPSR);
-	CASE_GET_AR(ITC);
-	CASE_GET_AR(PFS);
-	CASE_GET_AR(LC);
-	CASE_GET_AR(EC);
-
-	CASE_GET_CR(DCR);
-	CASE_GET_CR(ITM);
-	CASE_GET_CR(IVA);
-	CASE_GET_CR(PTA);
-	CASE_GET_CR(IPSR);
-	CASE_GET_CR(ISR);
-	CASE_GET_CR(IIP);
-	CASE_GET_CR(IFA);
-	CASE_GET_CR(ITIR);
-	CASE_GET_CR(IIPA);
-	CASE_GET_CR(IFS);
-	CASE_GET_CR(IIM);
-	CASE_GET_CR(IHA);
-	CASE_GET_CR(LID);
-	CASE_GET_CR(IVR);
-	CASE_GET_CR(TPR);
-	CASE_GET_CR(EOI);
-	CASE_GET_CR(IRR0);
-	CASE_GET_CR(IRR1);
-	CASE_GET_CR(IRR2);
-	CASE_GET_CR(IRR3);
-	CASE_GET_CR(ITV);
-	CASE_GET_CR(PMV);
-	CASE_GET_CR(CMCV);
-	CASE_GET_CR(LRR0);
-	CASE_GET_CR(LRR1);
-
-	default:
-		printk(KERN_CRIT "wrong_getreg %d\n", regnum);
-		break;
-	}
-	return res;
-}
-
-#define CASE_SET_REG(id)				\
-	case _IA64_REG_ ## id:				\
-	ia64_native_setreg(_IA64_REG_ ## id, val);	\
-	break;
-#define CASE_SET_AR(id) CASE_SET_REG(AR_ ## id)
-#define CASE_SET_CR(id) CASE_SET_REG(CR_ ## id)
-
-void
-ia64_native_setreg_func(int regnum, unsigned long val)
-{
-	switch (regnum) {
-	case _IA64_REG_PSR_L:
-		ia64_native_setreg(_IA64_REG_PSR_L, val);
-		ia64_dv_serialize_data();
-		break;
-	CASE_SET_REG(SP);
-	CASE_SET_REG(GP);
-
-	CASE_SET_AR(KR0);
-	CASE_SET_AR(KR1);
-	CASE_SET_AR(KR2);
-	CASE_SET_AR(KR3);
-	CASE_SET_AR(KR4);
-	CASE_SET_AR(KR5);
-	CASE_SET_AR(KR6);
-	CASE_SET_AR(KR7);
-	CASE_SET_AR(RSC);
-	CASE_SET_AR(BSP);
-	CASE_SET_AR(BSPSTORE);
-	CASE_SET_AR(RNAT);
-	CASE_SET_AR(FCR);
-	CASE_SET_AR(EFLAG);
-	CASE_SET_AR(CSD);
-	CASE_SET_AR(SSD);
-	CASE_SET_AR(CFLAG);
-	CASE_SET_AR(FSR);
-	CASE_SET_AR(FIR);
-	CASE_SET_AR(FDR);
-	CASE_SET_AR(CCV);
-	CASE_SET_AR(UNAT);
-	CASE_SET_AR(FPSR);
-	CASE_SET_AR(ITC);
-	CASE_SET_AR(PFS);
-	CASE_SET_AR(LC);
-	CASE_SET_AR(EC);
-
-	CASE_SET_CR(DCR);
-	CASE_SET_CR(ITM);
-	CASE_SET_CR(IVA);
-	CASE_SET_CR(PTA);
-	CASE_SET_CR(IPSR);
-	CASE_SET_CR(ISR);
-	CASE_SET_CR(IIP);
-	CASE_SET_CR(IFA);
-	CASE_SET_CR(ITIR);
-	CASE_SET_CR(IIPA);
-	CASE_SET_CR(IFS);
-	CASE_SET_CR(IIM);
-	CASE_SET_CR(IHA);
-	CASE_SET_CR(LID);
-	CASE_SET_CR(IVR);
-	CASE_SET_CR(TPR);
-	CASE_SET_CR(EOI);
-	CASE_SET_CR(IRR0);
-	CASE_SET_CR(IRR1);
-	CASE_SET_CR(IRR2);
-	CASE_SET_CR(IRR3);
-	CASE_SET_CR(ITV);
-	CASE_SET_CR(PMV);
-	CASE_SET_CR(CMCV);
-	CASE_SET_CR(LRR0);
-	CASE_SET_CR(LRR1);
-	default:
-		printk(KERN_CRIT "wrong setreg %d\n", regnum);
-		break;
-	}
-}
-#else
-
-#define __DEFINE_FUNC(name, code)					\
-	extern const char ia64_native_ ## name ## _direct_start[];	\
-	extern const char ia64_native_ ## name ## _direct_end[];	\
-	asm (".align 32\n"						\
-	     ".proc ia64_native_" #name "_func\n"			\
-	     "ia64_native_" #name "_func:\n"				\
-	     "ia64_native_" #name "_direct_start:\n"			\
-	     code							\
-	     "ia64_native_" #name "_direct_end:\n"			\
-	     "br.cond.sptk.many b6\n"					\
-	     ".endp ia64_native_" #name "_func\n")
-
-#define DEFINE_VOID_FUNC0(name, code)				\
-	extern void						\
-	ia64_native_ ## name ## _func(void);			\
-	__DEFINE_FUNC(name, code)
-
-#define DEFINE_VOID_FUNC1(name, code)				\
-	extern void						\
-	ia64_native_ ## name ## _func(unsigned long arg);	\
-	__DEFINE_FUNC(name, code)
-
-#define DEFINE_VOID_FUNC1_VOID(name, code)			\
-	extern void						\
-	ia64_native_ ## name ## _func(void *arg);		\
-	__DEFINE_FUNC(name, code)
-
-#define DEFINE_VOID_FUNC2(name, code)				\
-	extern void						\
-	ia64_native_ ## name ## _func(unsigned long arg0,	\
-				      unsigned long arg1);	\
-	__DEFINE_FUNC(name, code)
-
-#define DEFINE_FUNC0(name, code)		\
-	extern unsigned long			\
-	ia64_native_ ## name ## _func(void);	\
-	__DEFINE_FUNC(name, code)
-
-#define DEFINE_FUNC1(name, type, code)			\
-	extern unsigned long				\
-	ia64_native_ ## name ## _func(type arg);	\
-	__DEFINE_FUNC(name, code)
-
-DEFINE_VOID_FUNC1_VOID(fc,
-		       "fc r8\n");
-DEFINE_VOID_FUNC1(intrin_local_irq_restore,
-		  ";;\n"
-		  "     cmp.ne p6, p7 = r8, r0\n"
-		  ";;\n"
-		  "(p6) ssm psr.i\n"
-		  "(p7) rsm psr.i\n"
-		  ";;\n"
-		  "(p6) srlz.d\n");
-
-DEFINE_VOID_FUNC2(ptcga,
-		  "ptc.ga r8, r9\n");
-DEFINE_VOID_FUNC2(set_rr,
-		  "mov rr[r8] = r9\n");
-
-/* ia64_native_getreg(_IA64_REG_PSR) & IA64_PSR_I */
-DEFINE_FUNC0(get_psr_i,
-	     "mov r2 = " __stringify(1 << IA64_PSR_I_BIT) "\n"
-	     "mov r8 = psr\n"
-	     ";;\n"
-	     "and r8 = r2, r8\n");
-
-DEFINE_FUNC1(thash, unsigned long,
-	     "thash r8 = r8\n");
-DEFINE_FUNC1(get_cpuid, int,
-	     "mov r8 = cpuid[r8]\n");
-DEFINE_FUNC1(get_pmd, int,
-	     "mov r8 = pmd[r8]\n");
-DEFINE_FUNC1(get_rr, unsigned long,
-	     "mov r8 = rr[r8]\n");
-
-DEFINE_VOID_FUNC0(ssm_i,
-		  "ssm psr.i\n");
-DEFINE_VOID_FUNC0(rsm_i,
-		  "rsm psr.i\n");
-
-extern void
-ia64_native_set_rr0_to_rr4_func(unsigned long val0, unsigned long val1,
-				unsigned long val2, unsigned long val3,
-				unsigned long val4);
-__DEFINE_FUNC(set_rr0_to_rr4,
-	      "mov rr[r0] = r8\n"
-	      "movl r2 = 0x2000000000000000\n"
-	      ";;\n"
-	      "mov rr[r2] = r9\n"
-	      "shl r3 = r2, 1\n"	/* movl r3 = 0x4000000000000000 */
-	      ";;\n"
-	      "add r2 = r2, r3\n"	/* movl r2 = 0x6000000000000000 */
-	      "mov rr[r3] = r10\n"
-	      ";;\n"
-	      "mov rr[r2] = r11\n"
-	      "shl r3 = r3, 1\n"	/* movl r3 = 0x8000000000000000 */
-	      ";;\n"
-	      "mov rr[r3] = r14\n");
-
-extern unsigned long ia64_native_getreg_func(int regnum);
-asm(".global ia64_native_getreg_func\n");
-#define __DEFINE_GET_REG(id, reg)			\
-	"mov r2 = " __stringify(_IA64_REG_ ## id) "\n"	\
-	";;\n"						\
-	"cmp.eq p6, p0 = r2, r8\n"			\
-	";;\n"						\
-	"(p6) mov r8 = " #reg "\n"			\
-	"(p6) br.cond.sptk.many b6\n"			\
-	";;\n"
-#define __DEFINE_GET_AR(id, reg)	__DEFINE_GET_REG(AR_ ## id, ar.reg)
-#define __DEFINE_GET_CR(id, reg)	__DEFINE_GET_REG(CR_ ## id, cr.reg)
-
-__DEFINE_FUNC(getreg,
-	      __DEFINE_GET_REG(GP, gp)
-	      /*__DEFINE_GET_REG(IP, ip)*/ /* returned ip value shouldn't be constant */
-	      __DEFINE_GET_REG(PSR, psr)
-	      __DEFINE_GET_REG(TP, tp)
-	      __DEFINE_GET_REG(SP, sp)
-
-	      __DEFINE_GET_REG(AR_KR0, ar0)
-	      __DEFINE_GET_REG(AR_KR1, ar1)
-	      __DEFINE_GET_REG(AR_KR2, ar2)
-	      __DEFINE_GET_REG(AR_KR3, ar3)
-	      __DEFINE_GET_REG(AR_KR4, ar4)
-	      __DEFINE_GET_REG(AR_KR5, ar5)
-	      __DEFINE_GET_REG(AR_KR6, ar6)
-	      __DEFINE_GET_REG(AR_KR7, ar7)
-	      __DEFINE_GET_AR(RSC, rsc)
-	      __DEFINE_GET_AR(BSP, bsp)
-	      __DEFINE_GET_AR(BSPSTORE, bspstore)
-	      __DEFINE_GET_AR(RNAT, rnat)
-	      __DEFINE_GET_AR(FCR, fcr)
-	      __DEFINE_GET_AR(EFLAG, eflag)
-	      __DEFINE_GET_AR(CSD, csd)
-	      __DEFINE_GET_AR(SSD, ssd)
-	      __DEFINE_GET_REG(AR_CFLAG, ar27)
-	      __DEFINE_GET_AR(FSR, fsr)
-	      __DEFINE_GET_AR(FIR, fir)
-	      __DEFINE_GET_AR(FDR, fdr)
-	      __DEFINE_GET_AR(CCV, ccv)
-	      __DEFINE_GET_AR(UNAT, unat)
-	      __DEFINE_GET_AR(FPSR, fpsr)
-	      __DEFINE_GET_AR(ITC, itc)
-	      __DEFINE_GET_AR(PFS, pfs)
-	      __DEFINE_GET_AR(LC, lc)
-	      __DEFINE_GET_AR(EC, ec)
-
-	      __DEFINE_GET_CR(DCR, dcr)
-	      __DEFINE_GET_CR(ITM, itm)
-	      __DEFINE_GET_CR(IVA, iva)
-	      __DEFINE_GET_CR(PTA, pta)
-	      __DEFINE_GET_CR(IPSR, ipsr)
-	      __DEFINE_GET_CR(ISR, isr)
-	      __DEFINE_GET_CR(IIP, iip)
-	      __DEFINE_GET_CR(IFA, ifa)
-	      __DEFINE_GET_CR(ITIR, itir)
-	      __DEFINE_GET_CR(IIPA, iipa)
-	      __DEFINE_GET_CR(IFS, ifs)
-	      __DEFINE_GET_CR(IIM, iim)
-	      __DEFINE_GET_CR(IHA, iha)
-	      __DEFINE_GET_CR(LID, lid)
-	      __DEFINE_GET_CR(IVR, ivr)
-	      __DEFINE_GET_CR(TPR, tpr)
-	      __DEFINE_GET_CR(EOI, eoi)
-	      __DEFINE_GET_CR(IRR0, irr0)
-	      __DEFINE_GET_CR(IRR1, irr1)
-	      __DEFINE_GET_CR(IRR2, irr2)
-	      __DEFINE_GET_CR(IRR3, irr3)
-	      __DEFINE_GET_CR(ITV, itv)
-	      __DEFINE_GET_CR(PMV, pmv)
-	      __DEFINE_GET_CR(CMCV, cmcv)
-	      __DEFINE_GET_CR(LRR0, lrr0)
-	      __DEFINE_GET_CR(LRR1, lrr1)
-
-	      "mov r8 = -1\n"	/* unsupported case */
-	);
-
-extern void ia64_native_setreg_func(int regnum, unsigned long val);
-asm(".global ia64_native_setreg_func\n");
-#define __DEFINE_SET_REG(id, reg)			\
-	"mov r2 = " __stringify(_IA64_REG_ ## id) "\n"	\
-	";;\n"						\
-	"cmp.eq p6, p0 = r2, r9\n"			\
-	";;\n"						\
-	"(p6) mov " #reg " = r8\n"			\
-	"(p6) br.cond.sptk.many b6\n"			\
-	";;\n"
-#define __DEFINE_SET_AR(id, reg)	__DEFINE_SET_REG(AR_ ## id, ar.reg)
-#define __DEFINE_SET_CR(id, reg)	__DEFINE_SET_REG(CR_ ## id, cr.reg)
-__DEFINE_FUNC(setreg,
-	      "mov r2 = " __stringify(_IA64_REG_PSR_L) "\n"
-	      ";;\n"
-	      "cmp.eq p6, p0 = r2, r9\n"
-	      ";;\n"
-	      "(p6) mov psr.l = r8\n"
-#ifdef HAVE_SERIALIZE_DIRECTIVE
-	      ".serialize.data\n"
-#endif
-	      "(p6) br.cond.sptk.many b6\n"
-	      __DEFINE_SET_REG(GP, gp)
-	      __DEFINE_SET_REG(SP, sp)
-
-	      __DEFINE_SET_REG(AR_KR0, ar0)
-	      __DEFINE_SET_REG(AR_KR1, ar1)
-	      __DEFINE_SET_REG(AR_KR2, ar2)
-	      __DEFINE_SET_REG(AR_KR3, ar3)
-	      __DEFINE_SET_REG(AR_KR4, ar4)
-	      __DEFINE_SET_REG(AR_KR5, ar5)
-	      __DEFINE_SET_REG(AR_KR6, ar6)
-	      __DEFINE_SET_REG(AR_KR7, ar7)
-	      __DEFINE_SET_AR(RSC, rsc)
-	      __DEFINE_SET_AR(BSP, bsp)
-	      __DEFINE_SET_AR(BSPSTORE, bspstore)
-	      __DEFINE_SET_AR(RNAT, rnat)
-	      __DEFINE_SET_AR(FCR, fcr)
-	      __DEFINE_SET_AR(EFLAG, eflag)
-	      __DEFINE_SET_AR(CSD, csd)
-	      __DEFINE_SET_AR(SSD, ssd)
-	      __DEFINE_SET_REG(AR_CFLAG, ar27)
-	      __DEFINE_SET_AR(FSR, fsr)
-	      __DEFINE_SET_AR(FIR, fir)
-	      __DEFINE_SET_AR(FDR, fdr)
-	      __DEFINE_SET_AR(CCV, ccv)
-	      __DEFINE_SET_AR(UNAT, unat)
-	      __DEFINE_SET_AR(FPSR, fpsr)
-	      __DEFINE_SET_AR(ITC, itc)
-	      __DEFINE_SET_AR(PFS, pfs)
-	      __DEFINE_SET_AR(LC, lc)
-	      __DEFINE_SET_AR(EC, ec)
-
-	      __DEFINE_SET_CR(DCR, dcr)
-	      __DEFINE_SET_CR(ITM, itm)
-	      __DEFINE_SET_CR(IVA, iva)
-	      __DEFINE_SET_CR(PTA, pta)
-	      __DEFINE_SET_CR(IPSR, ipsr)
-	      __DEFINE_SET_CR(ISR, isr)
-	      __DEFINE_SET_CR(IIP, iip)
-	      __DEFINE_SET_CR(IFA, ifa)
-	      __DEFINE_SET_CR(ITIR, itir)
-	      __DEFINE_SET_CR(IIPA, iipa)
-	      __DEFINE_SET_CR(IFS, ifs)
-	      __DEFINE_SET_CR(IIM, iim)
-	      __DEFINE_SET_CR(IHA, iha)
-	      __DEFINE_SET_CR(LID, lid)
-	      __DEFINE_SET_CR(IVR, ivr)
-	      __DEFINE_SET_CR(TPR, tpr)
-	      __DEFINE_SET_CR(EOI, eoi)
-	      __DEFINE_SET_CR(IRR0, irr0)
-	      __DEFINE_SET_CR(IRR1, irr1)
-	      __DEFINE_SET_CR(IRR2, irr2)
-	      __DEFINE_SET_CR(IRR3, irr3)
-	      __DEFINE_SET_CR(ITV, itv)
-	      __DEFINE_SET_CR(PMV, pmv)
-	      __DEFINE_SET_CR(CMCV, cmcv)
-	      __DEFINE_SET_CR(LRR0, lrr0)
-	      __DEFINE_SET_CR(LRR1, lrr1)
-	);
-#endif
-
-struct pv_cpu_ops pv_cpu_ops = {
-	.fc		= ia64_native_fc_func,
-	.thash		= ia64_native_thash_func,
-	.get_cpuid	= ia64_native_get_cpuid_func,
-	.get_pmd	= ia64_native_get_pmd_func,
-	.ptcga		= ia64_native_ptcga_func,
-	.get_rr		= ia64_native_get_rr_func,
-	.set_rr		= ia64_native_set_rr_func,
-	.set_rr0_to_rr4	= ia64_native_set_rr0_to_rr4_func,
-	.ssm_i		= ia64_native_ssm_i_func,
-	.getreg		= ia64_native_getreg_func,
-	.setreg		= ia64_native_setreg_func,
-	.rsm_i		= ia64_native_rsm_i_func,
-	.get_psr_i	= ia64_native_get_psr_i_func,
-	.intrin_local_irq_restore
-			= ia64_native_intrin_local_irq_restore_func,
-};
-EXPORT_SYMBOL(pv_cpu_ops);
-
-/******************************************************************************
- * replacement of hand written assembly codes.
- */
-
-void
-paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch)
-{
-	extern unsigned long paravirt_switch_to_targ;
-	extern unsigned long paravirt_leave_syscall_targ;
-	extern unsigned long paravirt_work_processed_syscall_targ;
-	extern unsigned long paravirt_leave_kernel_targ;
-
-	paravirt_switch_to_targ = cpu_asm_switch->switch_to;
-	paravirt_leave_syscall_targ = cpu_asm_switch->leave_syscall;
-	paravirt_work_processed_syscall_targ =
-		cpu_asm_switch->work_processed_syscall;
-	paravirt_leave_kernel_targ = cpu_asm_switch->leave_kernel;
-}
-
-/***************************************************************************
- * pv_iosapic_ops
- * iosapic read/write hooks.
- */
-
-static unsigned int
-ia64_native_iosapic_read(char __iomem *iosapic, unsigned int reg)
-{
-	return __ia64_native_iosapic_read(iosapic, reg);
-}
-
-static void
-ia64_native_iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
-{
-	__ia64_native_iosapic_write(iosapic, reg, val);
-}
-
-struct pv_iosapic_ops pv_iosapic_ops = {
-	.pcat_compat_init = ia64_native_iosapic_pcat_compat_init,
-	.__get_irq_chip = ia64_native_iosapic_get_irq_chip,
-
-	.__read = ia64_native_iosapic_read,
-	.__write = ia64_native_iosapic_write,
-};
-
-/***************************************************************************
- * pv_irq_ops
- * irq operations
- */
-
-struct pv_irq_ops pv_irq_ops = {
-	.register_ipi = ia64_native_register_ipi,
-
-	.assign_irq_vector = ia64_native_assign_irq_vector,
-	.free_irq_vector = ia64_native_free_irq_vector,
-	.register_percpu_irq = ia64_native_register_percpu_irq,
-
-	.resend_irq = ia64_native_resend_irq,
-};
-
-/***************************************************************************
- * pv_time_ops
- * time operations
- */
-struct static_key paravirt_steal_enabled;
-struct static_key paravirt_steal_rq_enabled;
-
-static int
-ia64_native_do_steal_accounting(unsigned long *new_itm)
-{
-	return 0;
-}
-
-struct pv_time_ops pv_time_ops = {
-	.do_steal_accounting = ia64_native_do_steal_accounting,
-	.sched_clock = ia64_native_sched_clock,
-};
-
-/***************************************************************************
- * binary pacthing
- * pv_init_ops.patch_bundle
- */
-
-#ifdef ASM_SUPPORTED
-#define IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg)	\
-	__DEFINE_FUNC(get_ ## name,			\
-		      ";;\n"				\
-		      "mov r8 = " #reg "\n"		\
-		      ";;\n")
-
-#define IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg)	\
-	__DEFINE_FUNC(set_ ## name,			\
-		      ";;\n"				\
-		      "mov " #reg " = r8\n"		\
-		      ";;\n")
-
-#define IA64_NATIVE_PATCH_DEFINE_REG(name, reg)		\
-	IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg);	\
-	IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg)	\
-
-#define IA64_NATIVE_PATCH_DEFINE_AR(name, reg)			\
-	IA64_NATIVE_PATCH_DEFINE_REG(ar_ ## name, ar.reg)
-
-#define IA64_NATIVE_PATCH_DEFINE_CR(name, reg)			\
-	IA64_NATIVE_PATCH_DEFINE_REG(cr_ ## name, cr.reg)
-
-
-IA64_NATIVE_PATCH_DEFINE_GET_REG(psr, psr);
-IA64_NATIVE_PATCH_DEFINE_GET_REG(tp, tp);
-
-/* IA64_NATIVE_PATCH_DEFINE_SET_REG(psr_l, psr.l); */
-__DEFINE_FUNC(set_psr_l,
-	      ";;\n"
-	      "mov psr.l = r8\n"
-#ifdef HAVE_SERIALIZE_DIRECTIVE
-	      ".serialize.data\n"
-#endif
-	      ";;\n");
-
-IA64_NATIVE_PATCH_DEFINE_REG(gp, gp);
-IA64_NATIVE_PATCH_DEFINE_REG(sp, sp);
-
-IA64_NATIVE_PATCH_DEFINE_REG(kr0, ar0);
-IA64_NATIVE_PATCH_DEFINE_REG(kr1, ar1);
-IA64_NATIVE_PATCH_DEFINE_REG(kr2, ar2);
-IA64_NATIVE_PATCH_DEFINE_REG(kr3, ar3);
-IA64_NATIVE_PATCH_DEFINE_REG(kr4, ar4);
-IA64_NATIVE_PATCH_DEFINE_REG(kr5, ar5);
-IA64_NATIVE_PATCH_DEFINE_REG(kr6, ar6);
-IA64_NATIVE_PATCH_DEFINE_REG(kr7, ar7);
-
-IA64_NATIVE_PATCH_DEFINE_AR(rsc, rsc);
-IA64_NATIVE_PATCH_DEFINE_AR(bsp, bsp);
-IA64_NATIVE_PATCH_DEFINE_AR(bspstore, bspstore);
-IA64_NATIVE_PATCH_DEFINE_AR(rnat, rnat);
-IA64_NATIVE_PATCH_DEFINE_AR(fcr, fcr);
-IA64_NATIVE_PATCH_DEFINE_AR(eflag, eflag);
-IA64_NATIVE_PATCH_DEFINE_AR(csd, csd);
-IA64_NATIVE_PATCH_DEFINE_AR(ssd, ssd);
-IA64_NATIVE_PATCH_DEFINE_REG(ar27, ar27);
-IA64_NATIVE_PATCH_DEFINE_AR(fsr, fsr);
-IA64_NATIVE_PATCH_DEFINE_AR(fir, fir);
-IA64_NATIVE_PATCH_DEFINE_AR(fdr, fdr);
-IA64_NATIVE_PATCH_DEFINE_AR(ccv, ccv);
-IA64_NATIVE_PATCH_DEFINE_AR(unat, unat);
-IA64_NATIVE_PATCH_DEFINE_AR(fpsr, fpsr);
-IA64_NATIVE_PATCH_DEFINE_AR(itc, itc);
-IA64_NATIVE_PATCH_DEFINE_AR(pfs, pfs);
-IA64_NATIVE_PATCH_DEFINE_AR(lc, lc);
-IA64_NATIVE_PATCH_DEFINE_AR(ec, ec);
-
-IA64_NATIVE_PATCH_DEFINE_CR(dcr, dcr);
-IA64_NATIVE_PATCH_DEFINE_CR(itm, itm);
-IA64_NATIVE_PATCH_DEFINE_CR(iva, iva);
-IA64_NATIVE_PATCH_DEFINE_CR(pta, pta);
-IA64_NATIVE_PATCH_DEFINE_CR(ipsr, ipsr);
-IA64_NATIVE_PATCH_DEFINE_CR(isr, isr);
-IA64_NATIVE_PATCH_DEFINE_CR(iip, iip);
-IA64_NATIVE_PATCH_DEFINE_CR(ifa, ifa);
-IA64_NATIVE_PATCH_DEFINE_CR(itir, itir);
-IA64_NATIVE_PATCH_DEFINE_CR(iipa, iipa);
-IA64_NATIVE_PATCH_DEFINE_CR(ifs, ifs);
-IA64_NATIVE_PATCH_DEFINE_CR(iim, iim);
-IA64_NATIVE_PATCH_DEFINE_CR(iha, iha);
-IA64_NATIVE_PATCH_DEFINE_CR(lid, lid);
-IA64_NATIVE_PATCH_DEFINE_CR(ivr, ivr);
-IA64_NATIVE_PATCH_DEFINE_CR(tpr, tpr);
-IA64_NATIVE_PATCH_DEFINE_CR(eoi, eoi);
-IA64_NATIVE_PATCH_DEFINE_CR(irr0, irr0);
-IA64_NATIVE_PATCH_DEFINE_CR(irr1, irr1);
-IA64_NATIVE_PATCH_DEFINE_CR(irr2, irr2);
-IA64_NATIVE_PATCH_DEFINE_CR(irr3, irr3);
-IA64_NATIVE_PATCH_DEFINE_CR(itv, itv);
-IA64_NATIVE_PATCH_DEFINE_CR(pmv, pmv);
-IA64_NATIVE_PATCH_DEFINE_CR(cmcv, cmcv);
-IA64_NATIVE_PATCH_DEFINE_CR(lrr0, lrr0);
-IA64_NATIVE_PATCH_DEFINE_CR(lrr1, lrr1);
-
-static const struct paravirt_patch_bundle_elem ia64_native_patch_bundle_elems[]
-__initdata_or_module =
-{
-#define IA64_NATIVE_PATCH_BUNDLE_ELEM(name, type)		\
-	{							\
-		(void*)ia64_native_ ## name ## _direct_start,	\
-		(void*)ia64_native_ ## name ## _direct_end,	\
-		PARAVIRT_PATCH_TYPE_ ## type,			\
-	}
-
-	IA64_NATIVE_PATCH_BUNDLE_ELEM(fc, FC),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM(thash, THASH),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_cpuid, GET_CPUID),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_pmd, GET_PMD),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM(ptcga, PTCGA),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_rr, GET_RR),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr, SET_RR),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr0_to_rr4, SET_RR0_TO_RR4),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM(ssm_i, SSM_I),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM(rsm_i, RSM_I),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM(get_psr_i, GET_PSR_I),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM(intrin_local_irq_restore,
-				      INTRIN_LOCAL_IRQ_RESTORE),
-
-#define IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg)			\
-	{								\
-		(void*)ia64_native_get_ ## name ## _direct_start,	\
-		(void*)ia64_native_get_ ## name ## _direct_end,		\
-		PARAVIRT_PATCH_TYPE_GETREG + _IA64_REG_ ## reg,		\
-	}
-
-#define IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg)			\
-	{								\
-		(void*)ia64_native_set_ ## name ## _direct_start,	\
-		(void*)ia64_native_set_ ## name ## _direct_end,		\
-		PARAVIRT_PATCH_TYPE_SETREG + _IA64_REG_ ## reg,		\
-	}
-
-#define IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(name, reg)		\
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg),	\
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg)		\
-
-#define IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(name, reg)		\
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar_ ## name, AR_ ## reg)
-
-#define IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(name, reg)		\
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(cr_ ## name, CR_ ## reg)
-
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(psr, PSR),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(tp, TP),
-
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(psr_l, PSR_L),
-
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(gp, GP),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(sp, SP),
-
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr0, AR_KR0),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr1, AR_KR1),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr2, AR_KR2),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr3, AR_KR3),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr4, AR_KR4),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr5, AR_KR5),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr6, AR_KR6),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr7, AR_KR7),
-
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rsc, RSC),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bsp, BSP),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bspstore, BSPSTORE),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rnat, RNAT),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fcr, FCR),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(eflag, EFLAG),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(csd, CSD),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ssd, SSD),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar27, AR_CFLAG),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fsr, FSR),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fir, FIR),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fdr, FDR),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ccv, CCV),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(unat, UNAT),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fpsr, FPSR),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(itc, ITC),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(pfs, PFS),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(lc, LC),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ec, EC),
-
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(dcr, DCR),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itm, ITM),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iva, IVA),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pta, PTA),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ipsr, IPSR),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(isr, ISR),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iip, IIP),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifa, IFA),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itir, ITIR),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iipa, IIPA),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifs, IFS),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iim, IIM),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iha, IHA),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lid, LID),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ivr, IVR),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(tpr, TPR),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(eoi, EOI),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr0, IRR0),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr1, IRR1),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr2, IRR2),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr3, IRR3),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itv, ITV),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pmv, PMV),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(cmcv, CMCV),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr0, LRR0),
-	IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr1, LRR1),
-};
-
-unsigned long __init_or_module
-ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type)
-{
-	const unsigned long nelems = sizeof(ia64_native_patch_bundle_elems) /
-		sizeof(ia64_native_patch_bundle_elems[0]);
-
-	return __paravirt_patch_apply_bundle(sbundle, ebundle, type,
-					      ia64_native_patch_bundle_elems,
-					      nelems, NULL);
-}
-#endif /* ASM_SUPPOTED */
-
-extern const char ia64_native_switch_to[];
-extern const char ia64_native_leave_syscall[];
-extern const char ia64_native_work_processed_syscall[];
-extern const char ia64_native_leave_kernel[];
-
-const struct paravirt_patch_branch_target ia64_native_branch_target[]
-__initconst = {
-#define PARAVIRT_BR_TARGET(name, type)			\
-	{						\
-		ia64_native_ ## name,			\
-		PARAVIRT_PATCH_TYPE_BR_ ## type,	\
-	}
-	PARAVIRT_BR_TARGET(switch_to, SWITCH_TO),
-	PARAVIRT_BR_TARGET(leave_syscall, LEAVE_SYSCALL),
-	PARAVIRT_BR_TARGET(work_processed_syscall, WORK_PROCESSED_SYSCALL),
-	PARAVIRT_BR_TARGET(leave_kernel, LEAVE_KERNEL),
-};
-
-static void __init
-ia64_native_patch_branch(unsigned long tag, unsigned long type)
-{
-	const unsigned long nelem =
-		sizeof(ia64_native_branch_target) /
-		sizeof(ia64_native_branch_target[0]);
-	__paravirt_patch_apply_branch(tag, type,
-				      ia64_native_branch_target, nelem);
-}
diff --git a/arch/ia64/kernel/paravirt_inst.h b/arch/ia64/kernel/paravirt_inst.h
deleted file mode 100644
index 1ad7512b5f65..000000000000
--- a/arch/ia64/kernel/paravirt_inst.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/******************************************************************************
- * linux/arch/ia64/xen/paravirt_inst.h
- *
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- */
-
-#ifdef __IA64_ASM_PARAVIRTUALIZED_PVCHECK
-#include <asm/native/pvchk_inst.h>
-#else
-#include <asm/native/inst.h>
-#endif
-
diff --git a/arch/ia64/kernel/paravirt_patch.c b/arch/ia64/kernel/paravirt_patch.c
deleted file mode 100644
index bfdfef1b1ffd..000000000000
--- a/arch/ia64/kernel/paravirt_patch.c
+++ /dev/null
@@ -1,514 +0,0 @@
-/******************************************************************************
- * linux/arch/ia64/xen/paravirt_patch.c
- *
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- */
-
-#include <linux/init.h>
-#include <asm/intrinsics.h>
-#include <asm/kprobes.h>
-#include <asm/paravirt.h>
-#include <asm/paravirt_patch.h>
-
-typedef union ia64_inst {
-        struct {
-		unsigned long long qp : 6;
-		unsigned long long : 31;
-		unsigned long long opcode : 4;
-		unsigned long long reserved : 23;
-        } generic;
-        unsigned long long l;
-} ia64_inst_t;
-
-/*
- * flush_icache_range() can't be used here.
- * we are here before cpu_init() which initializes
- * ia64_i_cache_stride_shift. flush_icache_range() uses it.
- */
-void __init_or_module
-paravirt_flush_i_cache_range(const void *instr, unsigned long size)
-{
-	extern void paravirt_fc_i(const void *addr);
-	unsigned long i;
-
-	for (i = 0; i < size; i += sizeof(bundle_t))
-		paravirt_fc_i(instr + i);
-}
-
-bundle_t* __init_or_module
-paravirt_get_bundle(unsigned long tag)
-{
-	return (bundle_t *)(tag & ~3UL);
-}
-
-unsigned long __init_or_module
-paravirt_get_slot(unsigned long tag)
-{
-	return tag & 3UL;
-}
-
-unsigned long __init_or_module
-paravirt_get_num_inst(unsigned long stag, unsigned long etag)
-{
-	bundle_t *sbundle = paravirt_get_bundle(stag);
-	unsigned long sslot = paravirt_get_slot(stag);
-	bundle_t *ebundle = paravirt_get_bundle(etag);
-	unsigned long eslot = paravirt_get_slot(etag);
-
-	return (ebundle - sbundle) * 3 + eslot - sslot + 1;
-}
-
-unsigned long __init_or_module
-paravirt_get_next_tag(unsigned long tag)
-{
-	unsigned long slot = paravirt_get_slot(tag);
-
-	switch (slot) {
-	case 0:
-	case 1:
-		return tag + 1;
-	case 2: {
-		bundle_t *bundle = paravirt_get_bundle(tag);
-		return (unsigned long)(bundle + 1);
-	}
-	default:
-		BUG();
-	}
-	/* NOTREACHED */
-}
-
-ia64_inst_t __init_or_module
-paravirt_read_slot0(const bundle_t *bundle)
-{
-	ia64_inst_t inst;
-	inst.l = bundle->quad0.slot0;
-	return inst;
-}
-
-ia64_inst_t __init_or_module
-paravirt_read_slot1(const bundle_t *bundle)
-{
-	ia64_inst_t inst;
-	inst.l = bundle->quad0.slot1_p0 |
-		((unsigned long long)bundle->quad1.slot1_p1 << 18UL);
-	return inst;
-}
-
-ia64_inst_t __init_or_module
-paravirt_read_slot2(const bundle_t *bundle)
-{
-	ia64_inst_t inst;
-	inst.l = bundle->quad1.slot2;
-	return inst;
-}
-
-ia64_inst_t __init_or_module
-paravirt_read_inst(unsigned long tag)
-{
-	bundle_t *bundle = paravirt_get_bundle(tag);
-	unsigned long slot = paravirt_get_slot(tag);
-
-	switch (slot) {
-	case 0:
-		return paravirt_read_slot0(bundle);
-	case 1:
-		return paravirt_read_slot1(bundle);
-	case 2:
-		return paravirt_read_slot2(bundle);
-	default:
-		BUG();
-	}
-	/* NOTREACHED */
-}
-
-void __init_or_module
-paravirt_write_slot0(bundle_t *bundle, ia64_inst_t inst)
-{
-	bundle->quad0.slot0 = inst.l;
-}
-
-void __init_or_module
-paravirt_write_slot1(bundle_t *bundle, ia64_inst_t inst)
-{
-	bundle->quad0.slot1_p0 = inst.l;
-	bundle->quad1.slot1_p1 = inst.l >> 18UL;
-}
-
-void __init_or_module
-paravirt_write_slot2(bundle_t *bundle, ia64_inst_t inst)
-{
-	bundle->quad1.slot2 = inst.l;
-}
-
-void __init_or_module
-paravirt_write_inst(unsigned long tag, ia64_inst_t inst)
-{
-	bundle_t *bundle = paravirt_get_bundle(tag);
-	unsigned long slot = paravirt_get_slot(tag);
-
-	switch (slot) {
-	case 0:
-		paravirt_write_slot0(bundle, inst);
-		break;
-	case 1:
-		paravirt_write_slot1(bundle, inst);
-		break;
-	case 2:
-		paravirt_write_slot2(bundle, inst);
-		break;
-	default:
-		BUG();
-		break;
-	}
-	paravirt_flush_i_cache_range(bundle, sizeof(*bundle));
-}
-
-/* for debug */
-void
-paravirt_print_bundle(const bundle_t *bundle)
-{
-	const unsigned long *quad = (const unsigned long *)bundle;
-	ia64_inst_t slot0 = paravirt_read_slot0(bundle);
-	ia64_inst_t slot1 = paravirt_read_slot1(bundle);
-	ia64_inst_t slot2 = paravirt_read_slot2(bundle);
-
-	printk(KERN_DEBUG
-	       "bundle 0x%p 0x%016lx 0x%016lx\n", bundle, quad[0], quad[1]);
-	printk(KERN_DEBUG
-	       "bundle template 0x%x\n",
-	       bundle->quad0.template);
-	printk(KERN_DEBUG
-	       "slot0 0x%lx slot1_p0 0x%lx slot1_p1 0x%lx slot2 0x%lx\n",
-	       (unsigned long)bundle->quad0.slot0,
-	       (unsigned long)bundle->quad0.slot1_p0,
-	       (unsigned long)bundle->quad1.slot1_p1,
-	       (unsigned long)bundle->quad1.slot2);
-	printk(KERN_DEBUG
-	       "slot0 0x%016llx slot1 0x%016llx slot2 0x%016llx\n",
-	       slot0.l, slot1.l, slot2.l);
-}
-
-static int noreplace_paravirt __init_or_module = 0;
-
-static int __init setup_noreplace_paravirt(char *str)
-{
-	noreplace_paravirt = 1;
-	return 1;
-}
-__setup("noreplace-paravirt", setup_noreplace_paravirt);
-
-#ifdef ASM_SUPPORTED
-static void __init_or_module
-fill_nop_bundle(void *sbundle, void *ebundle)
-{
-	extern const char paravirt_nop_bundle[];
-	extern const unsigned long paravirt_nop_bundle_size;
-
-	void *bundle = sbundle;
-
-	BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0);
-	BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0);
-
-	while (bundle < ebundle) {
-		memcpy(bundle, paravirt_nop_bundle, paravirt_nop_bundle_size);
-
-		bundle += paravirt_nop_bundle_size;
-	}
-}
-
-/* helper function */
-unsigned long __init_or_module
-__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type,
-			      const struct paravirt_patch_bundle_elem *elems,
-			      unsigned long nelems,
-			      const struct paravirt_patch_bundle_elem **found)
-{
-	unsigned long used = 0;
-	unsigned long i;
-
-	BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0);
-	BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0);
-
-	found = NULL;
-	for (i = 0; i < nelems; i++) {
-		const struct paravirt_patch_bundle_elem *p = &elems[i];
-		if (p->type == type) {
-			unsigned long need = p->ebundle - p->sbundle;
-			unsigned long room = ebundle - sbundle;
-
-			if (found != NULL)
-				*found = p;
-
-			if (room < need) {
-				/* no room to replace. skip it */
-				printk(KERN_DEBUG
-				       "the space is too small to put "
-				       "bundles. type %ld need %ld room %ld\n",
-				       type, need, room);
-				break;
-			}
-
-			used = need;
-			memcpy(sbundle, p->sbundle, used);
-			break;
-		}
-	}
-
-	return used;
-}
-
-void __init_or_module
-paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start,
-			    const struct paravirt_patch_site_bundle *end)
-{
-	const struct paravirt_patch_site_bundle *p;
-
-	if (noreplace_paravirt)
-		return;
-	if (pv_init_ops.patch_bundle == NULL)
-		return;
-
-	for (p = start; p < end; p++) {
-		unsigned long used;
-
-		used = (*pv_init_ops.patch_bundle)(p->sbundle, p->ebundle,
-						   p->type);
-		if (used == 0)
-			continue;
-
-		fill_nop_bundle(p->sbundle + used, p->ebundle);
-		paravirt_flush_i_cache_range(p->sbundle,
-					     p->ebundle - p->sbundle);
-	}
-	ia64_sync_i();
-	ia64_srlz_i();
-}
-
-/*
- * nop.i, nop.m, nop.f instruction are same format.
- * but nop.b has differennt format.
- * This doesn't support nop.b for now.
- */
-static void __init_or_module
-fill_nop_inst(unsigned long stag, unsigned long etag)
-{
-	extern const bundle_t paravirt_nop_mfi_inst_bundle[];
-	unsigned long tag;
-	const ia64_inst_t nop_inst =
-		paravirt_read_slot0(paravirt_nop_mfi_inst_bundle);
-
-	for (tag = stag; tag < etag; tag = paravirt_get_next_tag(tag))
-		paravirt_write_inst(tag, nop_inst);
-}
-
-void __init_or_module
-paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start,
-			  const struct paravirt_patch_site_inst *end)
-{
-	const struct paravirt_patch_site_inst *p;
-
-	if (noreplace_paravirt)
-		return;
-	if (pv_init_ops.patch_inst == NULL)
-		return;
-
-	for (p = start; p < end; p++) {
-		unsigned long tag;
-		bundle_t *sbundle;
-		bundle_t *ebundle;
-
-		tag = (*pv_init_ops.patch_inst)(p->stag, p->etag, p->type);
-		if (tag == p->stag)
-			continue;
-
-		fill_nop_inst(tag, p->etag);
-		sbundle = paravirt_get_bundle(p->stag);
-		ebundle = paravirt_get_bundle(p->etag) + 1;
-		paravirt_flush_i_cache_range(sbundle, (ebundle - sbundle) *
-					     sizeof(bundle_t));
-	}
-	ia64_sync_i();
-	ia64_srlz_i();
-}
-#endif /* ASM_SUPPOTED */
-
-/* brl.cond.sptk.many <target64> X3 */
-typedef union inst_x3_op {
-	ia64_inst_t inst;
-	struct {
-		unsigned long qp: 6;
-		unsigned long btyp: 3;
-		unsigned long unused: 3;
-		unsigned long p: 1;
-		unsigned long imm20b: 20;
-		unsigned long wh: 2;
-		unsigned long d: 1;
-		unsigned long i: 1;
-		unsigned long opcode: 4;
-	};
-	unsigned long l;
-} inst_x3_op_t;
-
-typedef union inst_x3_imm {
-	ia64_inst_t inst;
-	struct {
-		unsigned long unused: 2;
-		unsigned long imm39: 39;
-	};
-	unsigned long l;
-} inst_x3_imm_t;
-
-void __init_or_module
-paravirt_patch_reloc_brl(unsigned long tag, const void *target)
-{
-	unsigned long tag_op = paravirt_get_next_tag(tag);
-	unsigned long tag_imm = tag;
-	bundle_t *bundle = paravirt_get_bundle(tag);
-
-	ia64_inst_t inst_op = paravirt_read_inst(tag_op);
-	ia64_inst_t inst_imm = paravirt_read_inst(tag_imm);
-
-	inst_x3_op_t inst_x3_op = { .l = inst_op.l };
-	inst_x3_imm_t inst_x3_imm = { .l = inst_imm.l };
-
-	unsigned long imm60 =
-		((unsigned long)target - (unsigned long)bundle) >> 4;
-
-	BUG_ON(paravirt_get_slot(tag) != 1); /* MLX */
-	BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0);
-
-	/* imm60[59] 1bit */
-	inst_x3_op.i = (imm60 >> 59) & 1;
-	/* imm60[19:0] 20bit */
-	inst_x3_op.imm20b = imm60 & ((1UL << 20) - 1);
-	/* imm60[58:20] 39bit */
-	inst_x3_imm.imm39 = (imm60 >> 20) & ((1UL << 39) - 1);
-
-	inst_op.l = inst_x3_op.l;
-	inst_imm.l = inst_x3_imm.l;
-
-	paravirt_write_inst(tag_op, inst_op);
-	paravirt_write_inst(tag_imm, inst_imm);
-}
-
-/* br.cond.sptk.many <target25>	B1 */
-typedef union inst_b1 {
-	ia64_inst_t inst;
-	struct {
-		unsigned long qp: 6;
-		unsigned long btype: 3;
-		unsigned long unused: 3;
-		unsigned long p: 1;
-		unsigned long imm20b: 20;
-		unsigned long wh: 2;
-		unsigned long d: 1;
-		unsigned long s: 1;
-		unsigned long opcode: 4;
-	};
-	unsigned long l;
-} inst_b1_t;
-
-void __init
-paravirt_patch_reloc_br(unsigned long tag, const void *target)
-{
-	bundle_t *bundle = paravirt_get_bundle(tag);
-	ia64_inst_t inst = paravirt_read_inst(tag);
-	unsigned long target25 = (unsigned long)target - (unsigned long)bundle;
-	inst_b1_t inst_b1;
-
-	BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0);
-
-	inst_b1.l = inst.l;
-	if (target25 & (1UL << 63))
-		inst_b1.s = 1;
-	else
-		inst_b1.s = 0;
-
-	inst_b1.imm20b = target25 >> 4;
-	inst.l = inst_b1.l;
-
-	paravirt_write_inst(tag, inst);
-}
-
-void __init
-__paravirt_patch_apply_branch(
-	unsigned long tag, unsigned long type,
-	const struct paravirt_patch_branch_target *entries,
-	unsigned int nr_entries)
-{
-	unsigned int i;
-	for (i = 0; i < nr_entries; i++) {
-		if (entries[i].type == type) {
-			paravirt_patch_reloc_br(tag, entries[i].entry);
-			break;
-		}
-	}
-}
-
-static void __init
-paravirt_patch_apply_branch(const struct paravirt_patch_site_branch *start,
-			    const struct paravirt_patch_site_branch *end)
-{
-	const struct paravirt_patch_site_branch *p;
-
-	if (noreplace_paravirt)
-		return;
-	if (pv_init_ops.patch_branch == NULL)
-		return;
-
-	for (p = start; p < end; p++)
-		(*pv_init_ops.patch_branch)(p->tag, p->type);
-
-	ia64_sync_i();
-	ia64_srlz_i();
-}
-
-void __init
-paravirt_patch_apply(void)
-{
-	extern const char __start_paravirt_bundles[];
-	extern const char __stop_paravirt_bundles[];
-	extern const char __start_paravirt_insts[];
-	extern const char __stop_paravirt_insts[];
-	extern const char __start_paravirt_branches[];
-	extern const char __stop_paravirt_branches[];
-
-	paravirt_patch_apply_bundle((const struct paravirt_patch_site_bundle *)
-				    __start_paravirt_bundles,
-				    (const struct paravirt_patch_site_bundle *)
-				    __stop_paravirt_bundles);
-	paravirt_patch_apply_inst((const struct paravirt_patch_site_inst *)
-				  __start_paravirt_insts,
-				  (const struct paravirt_patch_site_inst *)
-				  __stop_paravirt_insts);
-	paravirt_patch_apply_branch((const struct paravirt_patch_site_branch *)
-				    __start_paravirt_branches,
-				    (const struct paravirt_patch_site_branch *)
-				    __stop_paravirt_branches);
-}
-
-/*
- * Local variables:
- * mode: C
- * c-set-style: "linux"
- * c-basic-offset: 8
- * tab-width: 8
- * indent-tabs-mode: t
- * End:
- */
diff --git a/arch/ia64/kernel/paravirt_patchlist.c b/arch/ia64/kernel/paravirt_patchlist.c
deleted file mode 100644
index 0a70720662ed..000000000000
--- a/arch/ia64/kernel/paravirt_patchlist.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- */
-
-#include <linux/bug.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <asm/paravirt.h>
-
-#define DECLARE(name)						\
-	extern unsigned long					\
-		__ia64_native_start_gate_##name##_patchlist[];	\
-	extern unsigned long					\
-		__ia64_native_end_gate_##name##_patchlist[]
-
-DECLARE(fsyscall);
-DECLARE(brl_fsys_bubble_down);
-DECLARE(vtop);
-DECLARE(mckinley_e9);
-
-extern unsigned long __start_gate_section[];
-
-#define ASSIGN(name)							    \
-	.start_##name##_patchlist =					    \
-		(unsigned long)__ia64_native_start_gate_##name##_patchlist, \
-	.end_##name##_patchlist =					    \
-		(unsigned long)__ia64_native_end_gate_##name##_patchlist
-
-struct pv_patchdata pv_patchdata __initdata = {
-	ASSIGN(fsyscall),
-	ASSIGN(brl_fsys_bubble_down),
-	ASSIGN(vtop),
-	ASSIGN(mckinley_e9),
-
-	.gate_section = (void*)__start_gate_section,
-};
-
-
-unsigned long __init
-paravirt_get_gate_patchlist(enum pv_gate_patchlist type)
-{
-
-#define CASE(NAME, name)					\
-	case PV_GATE_START_##NAME:				\
-		return pv_patchdata.start_##name##_patchlist;	\
-	case PV_GATE_END_##NAME:				\
-		return pv_patchdata.end_##name##_patchlist;	\
-
-	switch (type) {
-		CASE(FSYSCALL, fsyscall);
-		CASE(BRL_FSYS_BUBBLE_DOWN, brl_fsys_bubble_down);
-		CASE(VTOP, vtop);
-		CASE(MCKINLEY_E9, mckinley_e9);
-	default:
-		BUG();
-		break;
-	}
-	return 0;
-}
-
-void * __init
-paravirt_get_gate_section(void)
-{
-	return pv_patchdata.gate_section;
-}
diff --git a/arch/ia64/kernel/paravirt_patchlist.h b/arch/ia64/kernel/paravirt_patchlist.h
deleted file mode 100644
index 67cffc3643a3..000000000000
--- a/arch/ia64/kernel/paravirt_patchlist.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/******************************************************************************
- * linux/arch/ia64/xen/paravirt_patchlist.h
- *
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- */
-
-#include <asm/native/patchlist.h>
-
diff --git a/arch/ia64/kernel/paravirtentry.S b/arch/ia64/kernel/paravirtentry.S
deleted file mode 100644
index 92d880c4d3d1..000000000000
--- a/arch/ia64/kernel/paravirtentry.S
+++ /dev/null
@@ -1,121 +0,0 @@
-/******************************************************************************
- * linux/arch/ia64/xen/paravirtentry.S
- *
- * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
- *                    VA Linux Systems Japan K.K.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *
- */
-
-#include <linux/init.h>
-#include <asm/asmmacro.h>
-#include <asm/asm-offsets.h>
-#include <asm/paravirt_privop.h>
-#include <asm/paravirt_patch.h>
-#include "entry.h"
-
-#define DATA8(sym, init_value)			\
-	.pushsection .data..read_mostly ;	\
-	.align 8 ;				\
-	.global sym ;				\
-	sym: ;					\
-	data8 init_value ;			\
-	.popsection
-
-#define BRANCH(targ, reg, breg, type)					\
-	PARAVIRT_PATCH_SITE_BR(PARAVIRT_PATCH_TYPE_BR_ ## type) ;	\
-	;;								\
-	movl reg=targ ;							\
-	;;								\
-	ld8 reg=[reg] ;							\
-	;;								\
-	mov breg=reg ;							\
-	br.cond.sptk.many breg
-
-#define BRANCH_PROC(sym, reg, breg, type)				\
-	DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ;		\
-	GLOBAL_ENTRY(paravirt_ ## sym) ;				\
-		BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ;	\
-	END(paravirt_ ## sym)
-
-#define BRANCH_PROC_UNWINFO(sym, reg, breg, type)			\
-	DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ;		\
-	GLOBAL_ENTRY(paravirt_ ## sym) ;				\
-		PT_REGS_UNWIND_INFO(0) ;				\
-		BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ;	\
-	END(paravirt_ ## sym)
-
-
-BRANCH_PROC(switch_to, r22, b7, SWITCH_TO)
-BRANCH_PROC_UNWINFO(leave_syscall, r22, b7, LEAVE_SYSCALL)
-BRANCH_PROC(work_processed_syscall, r2, b7, WORK_PROCESSED_SYSCALL)
-BRANCH_PROC_UNWINFO(leave_kernel, r22, b7, LEAVE_KERNEL)
-
-
-#ifdef CONFIG_MODULES
-#define __INIT_OR_MODULE	.text
-#define __INITDATA_OR_MODULE	.data
-#else
-#define __INIT_OR_MODULE	__INIT
-#define __INITDATA_OR_MODULE	__INITDATA
-#endif /* CONFIG_MODULES */
-
-	__INIT_OR_MODULE
-	GLOBAL_ENTRY(paravirt_fc_i)
-	fc.i r32
-	br.ret.sptk.many rp
-	END(paravirt_fc_i)
-	__FINIT
-
-	__INIT_OR_MODULE
-	.align 32
-	GLOBAL_ENTRY(paravirt_nop_b_inst_bundle)
-	{
-		nop.b 0
-		nop.b 0
-		nop.b 0
-	}
-	END(paravirt_nop_b_inst_bundle)
-	__FINIT
-
-	/* NOTE: nop.[mfi] has same format */
-	__INIT_OR_MODULE
-	GLOBAL_ENTRY(paravirt_nop_mfi_inst_bundle)
-	{
-		nop.m 0
-		nop.f 0
-		nop.i 0
-	}
-	END(paravirt_nop_mfi_inst_bundle)
-	__FINIT
-
-	__INIT_OR_MODULE
-	GLOBAL_ENTRY(paravirt_nop_bundle)
-paravirt_nop_bundle_start:
-	{
-		nop 0
-		nop 0
-		nop 0
-	}
-paravirt_nop_bundle_end:
-	END(paravirt_nop_bundle)
-	__FINIT
-
-	__INITDATA_OR_MODULE
-	.align 8
-	.global paravirt_nop_bundle_size
-paravirt_nop_bundle_size:
-	data8	paravirt_nop_bundle_end - paravirt_nop_bundle_start
diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c
index 1cf091793714..944a8e2438a6 100644
--- a/arch/ia64/kernel/patch.c
+++ b/arch/ia64/kernel/patch.c
@@ -7,7 +7,6 @@
 #include <linux/init.h>
 #include <linux/string.h>
 
-#include <asm/paravirt.h>
 #include <asm/patch.h>
 #include <asm/processor.h>
 #include <asm/sections.h>
@@ -169,35 +168,16 @@ ia64_patch_mckinley_e9 (unsigned long start, unsigned long end)
 	ia64_srlz_i();
 }
 
-extern unsigned long ia64_native_fsyscall_table[NR_syscalls];
-extern char ia64_native_fsys_bubble_down[];
-struct pv_fsys_data pv_fsys_data __initdata = {
-	.fsyscall_table = (unsigned long *)ia64_native_fsyscall_table,
-	.fsys_bubble_down = (void *)ia64_native_fsys_bubble_down,
-};
-
-unsigned long * __init
-paravirt_get_fsyscall_table(void)
-{
-	return pv_fsys_data.fsyscall_table;
-}
-
-char * __init
-paravirt_get_fsys_bubble_down(void)
-{
-	return pv_fsys_data.fsys_bubble_down;
-}
-
 static void __init
 patch_fsyscall_table (unsigned long start, unsigned long end)
 {
-	u64 fsyscall_table = (u64)paravirt_get_fsyscall_table();
+	extern unsigned long fsyscall_table[NR_syscalls];
 	s32 *offp = (s32 *) start;
 	u64 ip;
 
 	while (offp < (s32 *) end) {
 		ip = (u64) ia64_imva((char *) offp + *offp);
-		ia64_patch_imm64(ip, fsyscall_table);
+		ia64_patch_imm64(ip, (u64) fsyscall_table);
 		ia64_fc((void *) ip);
 		++offp;
 	}
@@ -208,7 +188,7 @@ patch_fsyscall_table (unsigned long start, unsigned long end)
 static void __init
 patch_brl_fsys_bubble_down (unsigned long start, unsigned long end)
 {
-	u64 fsys_bubble_down = (u64)paravirt_get_fsys_bubble_down();
+	extern char fsys_bubble_down[];
 	s32 *offp = (s32 *) start;
 	u64 ip;
 
@@ -226,13 +206,13 @@ patch_brl_fsys_bubble_down (unsigned long start, unsigned long end)
 void __init
 ia64_patch_gate (void)
 {
-#	define START(name)	paravirt_get_gate_patchlist(PV_GATE_START_##name)
-#	define END(name)	paravirt_get_gate_patchlist(PV_GATE_END_##name)
+#	define START(name)	((unsigned long) __start_gate_##name##_patchlist)
+#	define END(name)	((unsigned long)__end_gate_##name##_patchlist)
 
-	patch_fsyscall_table(START(FSYSCALL), END(FSYSCALL));
-	patch_brl_fsys_bubble_down(START(BRL_FSYS_BUBBLE_DOWN), END(BRL_FSYS_BUBBLE_DOWN));
-	ia64_patch_vtop(START(VTOP), END(VTOP));
-	ia64_patch_mckinley_e9(START(MCKINLEY_E9), END(MCKINLEY_E9));
+	patch_fsyscall_table(START(fsyscall), END(fsyscall));
+	patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down));
+	ia64_patch_vtop(START(vtop), END(vtop));
+	ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9));
 }
 
 void ia64_patch_phys_stack_reg(unsigned long val)
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index b9761389cb8d..4f118b0d3091 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -50,8 +50,6 @@
 #include <asm/mca.h>
 #include <asm/meminit.h>
 #include <asm/page.h>
-#include <asm/paravirt.h>
-#include <asm/paravirt_patch.h>
 #include <asm/patch.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -360,8 +358,6 @@ reserve_memory (void)
 	rsvd_region[n].end   = (unsigned long) ia64_imva(_end);
 	n++;
 
-	n += paravirt_reserve_memory(&rsvd_region[n]);
-
 #ifdef CONFIG_BLK_DEV_INITRD
 	if (ia64_boot_param->initrd_start) {
 		rsvd_region[n].start = (unsigned long)__va(ia64_boot_param->initrd_start);
@@ -528,10 +524,7 @@ setup_arch (char **cmdline_p)
 {
 	unw_init();
 
-	paravirt_arch_setup_early();
-
 	ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist);
-	paravirt_patch_apply();
 
 	*cmdline_p = __va(ia64_boot_param->command_line);
 	strlcpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE);
@@ -594,9 +587,6 @@ setup_arch (char **cmdline_p)
 	cpu_init();	/* initialize the bootstrap CPU */
 	mmu_context_init();	/* initialize context_id bitmap */
 
-	paravirt_banner();
-	paravirt_arch_setup_console(cmdline_p);
-
 #ifdef CONFIG_VT
 	if (!conswitchp) {
 # if defined(CONFIG_DUMMY_CONSOLE)
@@ -616,8 +606,6 @@ setup_arch (char **cmdline_p)
 #endif
 
 	/* enable IA-64 Machine Check Abort Handling unless disabled */
-	if (paravirt_arch_setup_nomca())
-		nomca = 1;
 	if (!nomca)
 		ia64_mca_init();
 
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 15051e9c2c6f..0e76fad27975 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -49,7 +49,6 @@
 #include <asm/machvec.h>
 #include <asm/mca.h>
 #include <asm/page.h>
-#include <asm/paravirt.h>
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -127,7 +126,7 @@ int smp_num_siblings = 1;
 volatile int ia64_cpu_to_sapicid[NR_CPUS];
 EXPORT_SYMBOL(ia64_cpu_to_sapicid);
 
-static volatile cpumask_t cpu_callin_map;
+static cpumask_t cpu_callin_map;
 
 struct smp_boot_data smp_boot_data __initdata;
 
@@ -477,6 +476,7 @@ do_boot_cpu (int sapicid, int cpu, struct task_struct *idle)
 	for (timeout = 0; timeout < 100000; timeout++) {
 		if (cpumask_test_cpu(cpu, &cpu_callin_map))
 			break;  /* It has booted */
+		barrier(); /* Make sure we re-read cpu_callin_map */
 		udelay(100);
 	}
 	Dprintk("\n");
@@ -568,7 +568,6 @@ void smp_prepare_boot_cpu(void)
 	cpumask_set_cpu(smp_processor_id(), &cpu_callin_map);
 	set_numa_node(cpu_to_node_map[smp_processor_id()]);
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
-	paravirt_post_smp_prepare_boot_cpu();
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 9a0104a38cd3..c8dbe2acd735 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -25,7 +25,6 @@
 #include <asm/machvec.h>
 #include <asm/delay.h>
 #include <asm/hw_irq.h>
-#include <asm/paravirt.h>
 #include <asm/ptrace.h>
 #include <asm/sal.h>
 #include <asm/sections.h>
@@ -47,33 +46,12 @@ EXPORT_SYMBOL(last_cli_ip);
 
 #endif
 
-#ifdef CONFIG_PARAVIRT
-/* We need to define a real function for sched_clock, to override the
-   weak default version */
-unsigned long long sched_clock(void)
-{
-        return paravirt_sched_clock();
-}
-#endif
-
-#ifdef CONFIG_PARAVIRT
-static void
-paravirt_clocksource_resume(struct clocksource *cs)
-{
-	if (pv_time_ops.clocksource_resume)
-		pv_time_ops.clocksource_resume();
-}
-#endif
-
 static struct clocksource clocksource_itc = {
 	.name           = "itc",
 	.rating         = 350,
 	.read           = itc_get_cycles,
 	.mask           = CLOCKSOURCE_MASK(64),
 	.flags          = CLOCK_SOURCE_IS_CONTINUOUS,
-#ifdef CONFIG_PARAVIRT
-	.resume		= paravirt_clocksource_resume,
-#endif
 };
 static struct clocksource *itc_clocksource;
 
@@ -164,9 +142,6 @@ timer_interrupt (int irq, void *dev_id)
 
 	profile_tick(CPU_PROFILING);
 
-	if (paravirt_do_steal_accounting(&new_itm))
-		goto skip_process_time_accounting;
-
 	while (1) {
 		update_process_times(user_mode(get_irq_regs()));
 
@@ -187,8 +162,6 @@ timer_interrupt (int irq, void *dev_id)
 		local_irq_disable();
 	}
 
-skip_process_time_accounting:
-
 	do {
 		/*
 		 * If we're too close to the next clock tick for
@@ -337,8 +310,6 @@ void ia64_init_itm(void)
 		 */
 		clocksource_itc.rating = 50;
 
-	paravirt_init_missing_ticks_accounting(smp_processor_id());
-
 	/* avoid softlock up message when cpu is unplug and plugged again. */
 	touch_softlockup_watchdog();
 
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 84f8a52ac5ae..dc506b05ffbd 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -136,27 +136,6 @@ SECTIONS {
 		__end___mckinley_e9_bundles = .;
 	}
 
-#if defined(CONFIG_PARAVIRT)
-	. = ALIGN(16);
-	.paravirt_bundles : AT(ADDR(.paravirt_bundles) - LOAD_OFFSET) {
-		__start_paravirt_bundles = .;
-		*(.paravirt_bundles)
-		__stop_paravirt_bundles = .;
-	}
-	. = ALIGN(16);
-	.paravirt_insts : AT(ADDR(.paravirt_insts) - LOAD_OFFSET) {
-		__start_paravirt_insts = .;
-		*(.paravirt_insts)
-		__stop_paravirt_insts = .;
-	}
-	. = ALIGN(16);
-	.paravirt_branches : AT(ADDR(.paravirt_branches) - LOAD_OFFSET) {
-		__start_paravirt_branches = .;
-		*(.paravirt_branches)
-		__stop_paravirt_branches = .;
-	}
-#endif
-
 #if defined(CONFIG_IA64_GENERIC)
 	/* Machine Vector */
 	. = ALIGN(16);
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index ba5ba7accd0d..70b40d1205a6 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -11,10 +11,10 @@
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
 #include <linux/prefetch.h>
+#include <linux/uaccess.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
-#include <asm/uaccess.h>
 
 extern int die(char *, struct pt_regs *, long);
 
@@ -96,7 +96,7 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
 	/*
 	 * If we're in an interrupt or have no user context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 #ifdef CONFIG_VIRTUAL_MEM_MAP
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index a9b65cf7b34a..7f3028965064 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -34,7 +34,6 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/mca.h>
-#include <asm/paravirt.h>
 
 extern void ia64_tlb_init (void);
 
@@ -244,7 +243,6 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
 static void __init
 setup_gate (void)
 {
-	void *gate_section;
 	struct page *page;
 
 	/*
@@ -252,11 +250,10 @@ setup_gate (void)
 	 * headers etc. and once execute-only page to enable
 	 * privilege-promotion via "epc":
 	 */
-	gate_section = paravirt_get_gate_section();
-	page = virt_to_page(ia64_imva(gate_section));
+	page = virt_to_page(ia64_imva(__start_gate_section));
 	put_kernel_page(page, GATE_ADDR, PAGE_READONLY);
 #ifdef HAVE_BUGGY_SEGREL
-	page = virt_to_page(ia64_imva(gate_section + PAGE_SIZE));
+	page = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE));
 	put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE);
 #else
 	put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);
@@ -642,8 +639,8 @@ mem_init (void)
 	 * code can tell them apart.
 	 */
 	for (i = 0; i < NR_syscalls; ++i) {
+		extern unsigned long fsyscall_table[NR_syscalls];
 		extern unsigned long sys_call_table[NR_syscalls];
-		unsigned long *fsyscall_table = paravirt_get_fsyscall_table();
 
 		if (!fsyscall_table[i] || nolwsys)
 			fsyscall_table[i] = sys_call_table[i] | 1;
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index d4e162d35b34..7cc3be9fa7c6 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -478,9 +478,16 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
 
 int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
 {
-	struct pci_controller *controller = bridge->bus->sysdata;
-
-	ACPI_COMPANION_SET(&bridge->dev, controller->companion);
+	/*
+	 * We pass NULL as parent to pci_create_root_bus(), so if it is not NULL
+	 * here, pci_create_root_bus() has been called by someone else and
+	 * sysdata is likely to be different from what we expect.  Let it go in
+	 * that case.
+	 */
+	if (!bridge->dev.parent) {
+		struct pci_controller *controller = bridge->bus->sysdata;
+		ACPI_COMPANION_SET(&bridge->dev, controller->companion);
+	}
 	return 0;
 }
 
diff --git a/arch/ia64/scripts/pvcheck.sed b/arch/ia64/scripts/pvcheck.sed
deleted file mode 100644
index e59809a3fc01..000000000000
--- a/arch/ia64/scripts/pvcheck.sed
+++ /dev/null
@@ -1,33 +0,0 @@
-#
-# Checker for paravirtualizations of privileged operations.
-#
-s/ssm.*psr\.ic.*/.warning \"ssm psr.ic should not be used directly\"/g
-s/rsm.*psr\.ic.*/.warning \"rsm psr.ic should not be used directly\"/g
-s/ssm.*psr\.i.*/.warning \"ssm psr.i should not be used directly\"/g
-s/rsm.*psr\.i.*/.warning \"rsm psr.i should not be used directly\"/g
-s/ssm.*psr\.dt.*/.warning \"ssm psr.dt should not be used directly\"/g
-s/rsm.*psr\.dt.*/.warning \"rsm psr.dt should not be used directly\"/g
-s/mov.*=.*cr\.ifa/.warning \"cr.ifa should not used directly\"/g
-s/mov.*=.*cr\.itir/.warning \"cr.itir should not used directly\"/g
-s/mov.*=.*cr\.isr/.warning \"cr.isr should not used directly\"/g
-s/mov.*=.*cr\.iha/.warning \"cr.iha should not used directly\"/g
-s/mov.*=.*cr\.ipsr/.warning \"cr.ipsr should not used directly\"/g
-s/mov.*=.*cr\.iim/.warning \"cr.iim should not used directly\"/g
-s/mov.*=.*cr\.iip/.warning \"cr.iip should not used directly\"/g
-s/mov.*=.*cr\.ivr/.warning \"cr.ivr should not used directly\"/g
-s/mov.*=[^\.]*psr/.warning \"psr should not used directly\"/g	# avoid ar.fpsr
-s/mov.*=.*ar\.eflags/.warning \"ar.eflags should not used directly\"/g
-s/mov.*=.*ar\.itc.*/.warning \"ar.itc should not used directly\"/g
-s/mov.*cr\.ifa.*=.*/.warning \"cr.ifa should not used directly\"/g
-s/mov.*cr\.itir.*=.*/.warning \"cr.itir should not used directly\"/g
-s/mov.*cr\.iha.*=.*/.warning \"cr.iha should not used directly\"/g
-s/mov.*cr\.ipsr.*=.*/.warning \"cr.ipsr should not used directly\"/g
-s/mov.*cr\.ifs.*=.*/.warning \"cr.ifs should not used directly\"/g
-s/mov.*cr\.iip.*=.*/.warning \"cr.iip should not used directly\"/g
-s/mov.*cr\.kr.*=.*/.warning \"cr.kr should not used directly\"/g
-s/mov.*ar\.eflags.*=.*/.warning \"ar.eflags should not used directly\"/g
-s/itc\.i.*/.warning \"itc.i should not be used directly.\"/g
-s/itc\.d.*/.warning \"itc.d should not be used directly.\"/g
-s/bsw\.0/.warning \"bsw.0 should not be used directly.\"/g
-s/bsw\.1/.warning \"bsw.1 should not be used directly.\"/g
-s/ptc\.ga.*/.warning \"ptc.ga should not be used directly.\"/g
diff --git a/arch/m32r/include/asm/cmpxchg.h b/arch/m32r/include/asm/cmpxchg.h
index de651db20b43..14bf9b739dd2 100644
--- a/arch/m32r/include/asm/cmpxchg.h
+++ b/arch/m32r/include/asm/cmpxchg.h
@@ -107,8 +107,6 @@ __xchg_local(unsigned long x, volatile void *ptr, int size)
 	((__typeof__(*(ptr)))__xchg_local((unsigned long)(x), (ptr),	\
 			sizeof(*(ptr))))
 
-#define __HAVE_ARCH_CMPXCHG	1
-
 static inline unsigned long
 __cmpxchg_u32(volatile unsigned int *p, unsigned int old, unsigned int new)
 {
diff --git a/arch/m32r/include/asm/io.h b/arch/m32r/include/asm/io.h
index 9cc00dbd59ce..0c3f25ee3381 100644
--- a/arch/m32r/include/asm/io.h
+++ b/arch/m32r/include/asm/io.h
@@ -68,6 +68,7 @@ static inline void __iomem *ioremap(unsigned long offset, unsigned long size)
 extern void iounmap(volatile void __iomem *addr);
 #define ioremap_nocache(off,size) ioremap(off,size)
 #define ioremap_wc ioremap_nocache
+#define ioremap_wt ioremap_nocache
 
 /*
  * IO bus memory addresses are also 1:1 with the physical address
diff --git a/arch/m32r/include/asm/uaccess.h b/arch/m32r/include/asm/uaccess.h
index 71adff209405..cac7014daef3 100644
--- a/arch/m32r/include/asm/uaccess.h
+++ b/arch/m32r/include/asm/uaccess.h
@@ -91,7 +91,8 @@ static inline void set_fs(mm_segment_t s)
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Checks if a pointer to a block of memory in user space is valid.
  *
@@ -155,7 +156,8 @@ extern int fixup_exception(struct pt_regs *regs);
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -175,7 +177,8 @@ extern int fixup_exception(struct pt_regs *regs);
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -194,7 +197,8 @@ extern int fixup_exception(struct pt_regs *regs);
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -274,7 +278,8 @@ do {									\
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -568,7 +573,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
  * @from: Source address, in kernel space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -588,7 +594,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
  * @from: Source address, in kernel space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.
  *
@@ -606,7 +613,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
  * @from: Source address, in user space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -626,7 +634,8 @@ unsigned long __generic_copy_from_user(void *, const void __user *, unsigned lon
  * @from: Source address, in user space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.
  *
@@ -677,7 +686,8 @@ unsigned long clear_user(void __user *mem, unsigned long len);
  * strlen_user: - Get the size of a string in user space.
  * @str: The string to measure.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Get the size of a NUL-terminated string in user space.
  *
diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c
index e3d4d4890104..8f9875b7933d 100644
--- a/arch/m32r/mm/fault.c
+++ b/arch/m32r/mm/fault.c
@@ -24,9 +24,9 @@
 #include <linux/vt_kern.h>		/* For unblank_screen() */
 #include <linux/highmem.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 
 #include <asm/m32r.h>
-#include <asm/uaccess.h>
 #include <asm/hardirq.h>
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
@@ -111,10 +111,10 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	mm = tsk->mm;
 
 	/*
-	 * If we're in an interrupt or have no user context or are running in an
-	 * atomic region then we must not take the fault..
+	 * If we're in an interrupt or have no user context or have pagefaults
+	 * disabled then we must not take the fault.
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto bad_area_nosemaphore;
 
 	if (error_code & ACE_USERMODE)
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index ed1643b4c678..753a6237f99a 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -177,9 +177,9 @@ CONFIG_IP_SET_HASH_NETPORT=m
 CONFIG_IP_SET_HASH_NETIFACE=m
 CONFIG_IP_SET_LIST_SET=m
 CONFIG_NF_CONNTRACK_IPV4=m
-CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
 CONFIG_NFT_MASQ_IPV4=m
 CONFIG_NFT_REDIR_IPV4=m
@@ -267,7 +267,9 @@ CONFIG_BATMAN_ADV_DAT=y
 CONFIG_BATMAN_ADV_NC=y
 CONFIG_BATMAN_ADV_MCAST=y
 CONFIG_NETLINK_DIAG=m
+CONFIG_MPLS=y
 CONFIG_NET_MPLS_GSO=m
+CONFIG_MPLS_ROUTING=m
 # CONFIG_WIRELESS is not set
 # CONFIG_UEVENT_HELPER is not set
 CONFIG_DEVTMPFS=y
@@ -285,6 +287,7 @@ CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_DRBD=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_PMEM=m
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
@@ -324,6 +327,7 @@ CONFIG_DM_RAID=m
 CONFIG_DM_ZERO=m
 CONFIG_DM_MULTIPATH=m
 CONFIG_DM_UEVENT=y
+CONFIG_DM_LOG_WRITES=m
 CONFIG_TARGET_CORE=m
 CONFIG_TCM_IBLOCK=m
 CONFIG_TCM_FILEIO=m
@@ -348,6 +352,7 @@ CONFIG_VETH=m
 CONFIG_A2065=y
 CONFIG_ARIADNE=y
 # CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_CADENCE is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
 # CONFIG_NET_VENDOR_CIRRUS is not set
 # CONFIG_NET_VENDOR_HP is not set
@@ -414,7 +419,6 @@ CONFIG_HID=m
 CONFIG_HIDRAW=y
 CONFIG_UHID=m
 # CONFIG_HID_GENERIC is not set
-# CONFIG_HID_PLANTRONICS is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_MSM6242=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index d38822b1847e..1f93dcaf02e5 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -175,9 +175,9 @@ CONFIG_IP_SET_HASH_NETPORT=m
 CONFIG_IP_SET_HASH_NETIFACE=m
 CONFIG_IP_SET_LIST_SET=m
 CONFIG_NF_CONNTRACK_IPV4=m
-CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
 CONFIG_NFT_MASQ_IPV4=m
 CONFIG_NFT_REDIR_IPV4=m
@@ -265,7 +265,9 @@ CONFIG_BATMAN_ADV_DAT=y
 CONFIG_BATMAN_ADV_NC=y
 CONFIG_BATMAN_ADV_MCAST=y
 CONFIG_NETLINK_DIAG=m
+CONFIG_MPLS=y
 CONFIG_NET_MPLS_GSO=m
+CONFIG_MPLS_ROUTING=m
 # CONFIG_WIRELESS is not set
 # CONFIG_UEVENT_HELPER is not set
 CONFIG_DEVTMPFS=y
@@ -277,6 +279,7 @@ CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_DRBD=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_PMEM=m
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
@@ -306,6 +309,7 @@ CONFIG_DM_RAID=m
 CONFIG_DM_ZERO=m
 CONFIG_DM_MULTIPATH=m
 CONFIG_DM_UEVENT=y
+CONFIG_DM_LOG_WRITES=m
 CONFIG_TARGET_CORE=m
 CONFIG_TCM_IBLOCK=m
 CONFIG_TCM_FILEIO=m
@@ -327,6 +331,7 @@ CONFIG_NETCONSOLE=m
 CONFIG_NETCONSOLE_DYNAMIC=y
 CONFIG_VETH=m
 # CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_CADENCE is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
@@ -375,7 +380,6 @@ CONFIG_HID=m
 CONFIG_HIDRAW=y
 CONFIG_UHID=m
 # CONFIG_HID_GENERIC is not set
-# CONFIG_HID_PLANTRONICS is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index c429199cf4a9..831b8b8b92ad 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -175,9 +175,9 @@ CONFIG_IP_SET_HASH_NETPORT=m
 CONFIG_IP_SET_HASH_NETIFACE=m
 CONFIG_IP_SET_LIST_SET=m
 CONFIG_NF_CONNTRACK_IPV4=m
-CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
 CONFIG_NFT_MASQ_IPV4=m
 CONFIG_NFT_REDIR_IPV4=m
@@ -265,7 +265,9 @@ CONFIG_BATMAN_ADV_DAT=y
 CONFIG_BATMAN_ADV_NC=y
 CONFIG_BATMAN_ADV_MCAST=y
 CONFIG_NETLINK_DIAG=m
+CONFIG_MPLS=y
 CONFIG_NET_MPLS_GSO=m
+CONFIG_MPLS_ROUTING=m
 # CONFIG_WIRELESS is not set
 # CONFIG_UEVENT_HELPER is not set
 CONFIG_DEVTMPFS=y
@@ -281,6 +283,7 @@ CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_DRBD=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_PMEM=m
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
@@ -315,6 +318,7 @@ CONFIG_DM_RAID=m
 CONFIG_DM_ZERO=m
 CONFIG_DM_MULTIPATH=m
 CONFIG_DM_UEVENT=y
+CONFIG_DM_LOG_WRITES=m
 CONFIG_TARGET_CORE=m
 CONFIG_TCM_IBLOCK=m
 CONFIG_TCM_FILEIO=m
@@ -337,6 +341,7 @@ CONFIG_NETCONSOLE_DYNAMIC=y
 CONFIG_VETH=m
 CONFIG_ATARILANCE=y
 # CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_CADENCE is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
@@ -391,7 +396,6 @@ CONFIG_DMASOUND_ATARI=m
 CONFIG_HID=m
 CONFIG_HIDRAW=y
 CONFIG_UHID=m
-# CONFIG_HID_PLANTRONICS is not set
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
 # CONFIG_IOMMU_SUPPORT is not set
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index 9b880371d642..91fd187c16d5 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -173,9 +173,9 @@ CONFIG_IP_SET_HASH_NETPORT=m
 CONFIG_IP_SET_HASH_NETIFACE=m
 CONFIG_IP_SET_LIST_SET=m
 CONFIG_NF_CONNTRACK_IPV4=m
-CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
 CONFIG_NFT_MASQ_IPV4=m
 CONFIG_NFT_REDIR_IPV4=m
@@ -263,7 +263,9 @@ CONFIG_BATMAN_ADV_DAT=y
 CONFIG_BATMAN_ADV_NC=y
 CONFIG_BATMAN_ADV_MCAST=y
 CONFIG_NETLINK_DIAG=m
+CONFIG_MPLS=y
 CONFIG_NET_MPLS_GSO=m
+CONFIG_MPLS_ROUTING=m
 # CONFIG_WIRELESS is not set
 # CONFIG_UEVENT_HELPER is not set
 CONFIG_DEVTMPFS=y
@@ -275,6 +277,7 @@ CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_DRBD=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_PMEM=m
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
@@ -305,6 +308,7 @@ CONFIG_DM_RAID=m
 CONFIG_DM_ZERO=m
 CONFIG_DM_MULTIPATH=m
 CONFIG_DM_UEVENT=y
+CONFIG_DM_LOG_WRITES=m
 CONFIG_TARGET_CORE=m
 CONFIG_TCM_IBLOCK=m
 CONFIG_TCM_FILEIO=m
@@ -326,6 +330,7 @@ CONFIG_NETCONSOLE=m
 CONFIG_NETCONSOLE_DYNAMIC=y
 CONFIG_VETH=m
 # CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_CADENCE is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
 CONFIG_BVME6000_NET=y
 # CONFIG_NET_VENDOR_MARVELL is not set
@@ -369,7 +374,6 @@ CONFIG_HID=m
 CONFIG_HIDRAW=y
 CONFIG_UHID=m
 # CONFIG_HID_GENERIC is not set
-# CONFIG_HID_PLANTRONICS is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index 49ae3376e993..9d4934f1d2c3 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -175,9 +175,9 @@ CONFIG_IP_SET_HASH_NETPORT=m
 CONFIG_IP_SET_HASH_NETIFACE=m
 CONFIG_IP_SET_LIST_SET=m
 CONFIG_NF_CONNTRACK_IPV4=m
-CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
 CONFIG_NFT_MASQ_IPV4=m
 CONFIG_NFT_REDIR_IPV4=m
@@ -265,7 +265,9 @@ CONFIG_BATMAN_ADV_DAT=y
 CONFIG_BATMAN_ADV_NC=y
 CONFIG_BATMAN_ADV_MCAST=y
 CONFIG_NETLINK_DIAG=m
+CONFIG_MPLS=y
 CONFIG_NET_MPLS_GSO=m
+CONFIG_MPLS_ROUTING=m
 # CONFIG_WIRELESS is not set
 # CONFIG_UEVENT_HELPER is not set
 CONFIG_DEVTMPFS=y
@@ -277,6 +279,7 @@ CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_DRBD=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_PMEM=m
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
@@ -306,6 +309,7 @@ CONFIG_DM_RAID=m
 CONFIG_DM_ZERO=m
 CONFIG_DM_MULTIPATH=m
 CONFIG_DM_UEVENT=y
+CONFIG_DM_LOG_WRITES=m
 CONFIG_TARGET_CORE=m
 CONFIG_TCM_IBLOCK=m
 CONFIG_TCM_FILEIO=m
@@ -328,6 +332,7 @@ CONFIG_NETCONSOLE_DYNAMIC=y
 CONFIG_VETH=m
 CONFIG_HPLANCE=y
 # CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_CADENCE is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
@@ -378,7 +383,6 @@ CONFIG_HID=m
 CONFIG_HIDRAW=y
 CONFIG_UHID=m
 # CONFIG_HID_GENERIC is not set
-# CONFIG_HID_PLANTRONICS is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index ee143a57058c..72bc187ca995 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -174,9 +174,9 @@ CONFIG_IP_SET_HASH_NETPORT=m
 CONFIG_IP_SET_HASH_NETIFACE=m
 CONFIG_IP_SET_LIST_SET=m
 CONFIG_NF_CONNTRACK_IPV4=m
-CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
 CONFIG_NFT_MASQ_IPV4=m
 CONFIG_NFT_REDIR_IPV4=m
@@ -267,7 +267,9 @@ CONFIG_BATMAN_ADV_DAT=y
 CONFIG_BATMAN_ADV_NC=y
 CONFIG_BATMAN_ADV_MCAST=y
 CONFIG_NETLINK_DIAG=m
+CONFIG_MPLS=y
 CONFIG_NET_MPLS_GSO=m
+CONFIG_MPLS_ROUTING=m
 # CONFIG_WIRELESS is not set
 # CONFIG_UEVENT_HELPER is not set
 CONFIG_DEVTMPFS=y
@@ -280,6 +282,7 @@ CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_DRBD=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_PMEM=m
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
@@ -315,6 +318,7 @@ CONFIG_DM_RAID=m
 CONFIG_DM_ZERO=m
 CONFIG_DM_MULTIPATH=m
 CONFIG_DM_UEVENT=y
+CONFIG_DM_LOG_WRITES=m
 CONFIG_TARGET_CORE=m
 CONFIG_TCM_IBLOCK=m
 CONFIG_TCM_FILEIO=m
@@ -344,6 +348,7 @@ CONFIG_NETCONSOLE_DYNAMIC=y
 CONFIG_VETH=m
 CONFIG_MACMACE=y
 # CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_CADENCE is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
 CONFIG_MAC89x0=y
 # CONFIG_NET_VENDOR_INTEL is not set
@@ -400,7 +405,6 @@ CONFIG_HID=m
 CONFIG_HIDRAW=y
 CONFIG_UHID=m
 # CONFIG_HID_GENERIC is not set
-# CONFIG_HID_PLANTRONICS is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index c777aa05048f..8fb65535597f 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -184,9 +184,9 @@ CONFIG_IP_SET_HASH_NETPORT=m
 CONFIG_IP_SET_HASH_NETIFACE=m
 CONFIG_IP_SET_LIST_SET=m
 CONFIG_NF_CONNTRACK_IPV4=m
-CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
 CONFIG_NFT_MASQ_IPV4=m
 CONFIG_NFT_REDIR_IPV4=m
@@ -277,7 +277,9 @@ CONFIG_BATMAN_ADV_DAT=y
 CONFIG_BATMAN_ADV_NC=y
 CONFIG_BATMAN_ADV_MCAST=y
 CONFIG_NETLINK_DIAG=m
+CONFIG_MPLS=y
 CONFIG_NET_MPLS_GSO=m
+CONFIG_MPLS_ROUTING=m
 # CONFIG_WIRELESS is not set
 # CONFIG_UEVENT_HELPER is not set
 CONFIG_DEVTMPFS=y
@@ -299,6 +301,7 @@ CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_DRBD=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_PMEM=m
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
@@ -348,6 +351,7 @@ CONFIG_DM_RAID=m
 CONFIG_DM_ZERO=m
 CONFIG_DM_MULTIPATH=m
 CONFIG_DM_UEVENT=y
+CONFIG_DM_LOG_WRITES=m
 CONFIG_TARGET_CORE=m
 CONFIG_TCM_IBLOCK=m
 CONFIG_TCM_FILEIO=m
@@ -384,6 +388,7 @@ CONFIG_MVME147_NET=y
 CONFIG_SUN3LANCE=y
 CONFIG_MACMACE=y
 # CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_CADENCE is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
 CONFIG_MAC89x0=y
 # CONFIG_NET_VENDOR_HP is not set
@@ -468,7 +473,6 @@ CONFIG_HID=m
 CONFIG_HIDRAW=y
 CONFIG_UHID=m
 # CONFIG_HID_GENERIC is not set
-# CONFIG_HID_PLANTRONICS is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_MSM6242=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index a7628a85e260..f34491ec0126 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -172,9 +172,9 @@ CONFIG_IP_SET_HASH_NETPORT=m
 CONFIG_IP_SET_HASH_NETIFACE=m
 CONFIG_IP_SET_LIST_SET=m
 CONFIG_NF_CONNTRACK_IPV4=m
-CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
 CONFIG_NFT_MASQ_IPV4=m
 CONFIG_NFT_REDIR_IPV4=m
@@ -262,7 +262,9 @@ CONFIG_BATMAN_ADV_DAT=y
 CONFIG_BATMAN_ADV_NC=y
 CONFIG_BATMAN_ADV_MCAST=y
 CONFIG_NETLINK_DIAG=m
+CONFIG_MPLS=y
 CONFIG_NET_MPLS_GSO=m
+CONFIG_MPLS_ROUTING=m
 # CONFIG_WIRELESS is not set
 # CONFIG_UEVENT_HELPER is not set
 CONFIG_DEVTMPFS=y
@@ -274,6 +276,7 @@ CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_DRBD=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_PMEM=m
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
@@ -304,6 +307,7 @@ CONFIG_DM_RAID=m
 CONFIG_DM_ZERO=m
 CONFIG_DM_MULTIPATH=m
 CONFIG_DM_UEVENT=y
+CONFIG_DM_LOG_WRITES=m
 CONFIG_TARGET_CORE=m
 CONFIG_TCM_IBLOCK=m
 CONFIG_TCM_FILEIO=m
@@ -326,6 +330,7 @@ CONFIG_NETCONSOLE_DYNAMIC=y
 CONFIG_VETH=m
 CONFIG_MVME147_NET=y
 # CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_CADENCE is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
@@ -369,7 +374,6 @@ CONFIG_HID=m
 CONFIG_HIDRAW=y
 CONFIG_UHID=m
 # CONFIG_HID_GENERIC is not set
-# CONFIG_HID_PLANTRONICS is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index ebaa68268a4a..3d3614d1b041 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -173,9 +173,9 @@ CONFIG_IP_SET_HASH_NETPORT=m
 CONFIG_IP_SET_HASH_NETIFACE=m
 CONFIG_IP_SET_LIST_SET=m
 CONFIG_NF_CONNTRACK_IPV4=m
-CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
 CONFIG_NFT_MASQ_IPV4=m
 CONFIG_NFT_REDIR_IPV4=m
@@ -263,7 +263,9 @@ CONFIG_BATMAN_ADV_DAT=y
 CONFIG_BATMAN_ADV_NC=y
 CONFIG_BATMAN_ADV_MCAST=y
 CONFIG_NETLINK_DIAG=m
+CONFIG_MPLS=y
 CONFIG_NET_MPLS_GSO=m
+CONFIG_MPLS_ROUTING=m
 # CONFIG_WIRELESS is not set
 # CONFIG_UEVENT_HELPER is not set
 CONFIG_DEVTMPFS=y
@@ -275,6 +277,7 @@ CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_DRBD=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_PMEM=m
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
@@ -305,6 +308,7 @@ CONFIG_DM_RAID=m
 CONFIG_DM_ZERO=m
 CONFIG_DM_MULTIPATH=m
 CONFIG_DM_UEVENT=y
+CONFIG_DM_LOG_WRITES=m
 CONFIG_TARGET_CORE=m
 CONFIG_TCM_IBLOCK=m
 CONFIG_TCM_FILEIO=m
@@ -326,6 +330,7 @@ CONFIG_NETCONSOLE=m
 CONFIG_NETCONSOLE_DYNAMIC=y
 CONFIG_VETH=m
 # CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_CADENCE is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
 CONFIG_MVME16x_NET=y
 # CONFIG_NET_VENDOR_MARVELL is not set
@@ -369,7 +374,6 @@ CONFIG_HID=m
 CONFIG_HIDRAW=y
 CONFIG_UHID=m
 # CONFIG_HID_GENERIC is not set
-# CONFIG_HID_PLANTRONICS is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 2c16853aedd3..643e9c93bea7 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -173,9 +173,9 @@ CONFIG_IP_SET_HASH_NETPORT=m
 CONFIG_IP_SET_HASH_NETIFACE=m
 CONFIG_IP_SET_LIST_SET=m
 CONFIG_NF_CONNTRACK_IPV4=m
-CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
 CONFIG_NFT_MASQ_IPV4=m
 CONFIG_NFT_REDIR_IPV4=m
@@ -263,7 +263,9 @@ CONFIG_BATMAN_ADV_DAT=y
 CONFIG_BATMAN_ADV_NC=y
 CONFIG_BATMAN_ADV_MCAST=y
 CONFIG_NETLINK_DIAG=m
+CONFIG_MPLS=y
 CONFIG_NET_MPLS_GSO=m
+CONFIG_MPLS_ROUTING=m
 # CONFIG_WIRELESS is not set
 # CONFIG_UEVENT_HELPER is not set
 CONFIG_DEVTMPFS=y
@@ -278,6 +280,7 @@ CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_DRBD=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_PMEM=m
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
@@ -311,6 +314,7 @@ CONFIG_DM_RAID=m
 CONFIG_DM_ZERO=m
 CONFIG_DM_MULTIPATH=m
 CONFIG_DM_UEVENT=y
+CONFIG_DM_LOG_WRITES=m
 CONFIG_TARGET_CORE=m
 CONFIG_TCM_IBLOCK=m
 CONFIG_TCM_FILEIO=m
@@ -334,6 +338,7 @@ CONFIG_VETH=m
 # CONFIG_NET_VENDOR_3COM is not set
 # CONFIG_NET_VENDOR_AMD is not set
 # CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_CADENCE is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
 # CONFIG_NET_VENDOR_CIRRUS is not set
 # CONFIG_NET_VENDOR_HP is not set
@@ -390,7 +395,6 @@ CONFIG_HID=m
 CONFIG_HIDRAW=y
 CONFIG_UHID=m
 # CONFIG_HID_GENERIC is not set
-# CONFIG_HID_PLANTRONICS is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index e3056bf0f65b..8fecc5aa166c 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -170,9 +170,9 @@ CONFIG_IP_SET_HASH_NETPORT=m
 CONFIG_IP_SET_HASH_NETIFACE=m
 CONFIG_IP_SET_LIST_SET=m
 CONFIG_NF_CONNTRACK_IPV4=m
-CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
 CONFIG_NFT_MASQ_IPV4=m
 CONFIG_NFT_REDIR_IPV4=m
@@ -260,7 +260,9 @@ CONFIG_BATMAN_ADV_DAT=y
 CONFIG_BATMAN_ADV_NC=y
 CONFIG_BATMAN_ADV_MCAST=y
 CONFIG_NETLINK_DIAG=m
+CONFIG_MPLS=y
 CONFIG_NET_MPLS_GSO=m
+CONFIG_MPLS_ROUTING=m
 # CONFIG_WIRELESS is not set
 # CONFIG_UEVENT_HELPER is not set
 CONFIG_DEVTMPFS=y
@@ -272,6 +274,7 @@ CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_DRBD=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_PMEM=m
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
@@ -302,6 +305,7 @@ CONFIG_DM_RAID=m
 CONFIG_DM_ZERO=m
 CONFIG_DM_MULTIPATH=m
 CONFIG_DM_UEVENT=y
+CONFIG_DM_LOG_WRITES=m
 CONFIG_TARGET_CORE=m
 CONFIG_TCM_IBLOCK=m
 CONFIG_TCM_FILEIO=m
@@ -324,6 +328,7 @@ CONFIG_NETCONSOLE_DYNAMIC=y
 CONFIG_VETH=m
 CONFIG_SUN3LANCE=y
 # CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_CADENCE is not set
 CONFIG_SUN3_82586=y
 # CONFIG_NET_VENDOR_MARVELL is not set
 # CONFIG_NET_VENDOR_MICREL is not set
@@ -370,7 +375,6 @@ CONFIG_HID=m
 CONFIG_HIDRAW=y
 CONFIG_UHID=m
 # CONFIG_HID_GENERIC is not set
-# CONFIG_HID_PLANTRONICS is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index 73c36b7a0009..9902c5bfbdc8 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -170,9 +170,9 @@ CONFIG_IP_SET_HASH_NETPORT=m
 CONFIG_IP_SET_HASH_NETIFACE=m
 CONFIG_IP_SET_LIST_SET=m
 CONFIG_NF_CONNTRACK_IPV4=m
-CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_ROUTE_IPV4=m
 CONFIG_NF_TABLES_ARP=m
+CONFIG_NF_LOG_ARP=m
 CONFIG_NFT_CHAIN_NAT_IPV4=m
 CONFIG_NFT_MASQ_IPV4=m
 CONFIG_NFT_REDIR_IPV4=m
@@ -260,7 +260,9 @@ CONFIG_BATMAN_ADV_DAT=y
 CONFIG_BATMAN_ADV_NC=y
 CONFIG_BATMAN_ADV_MCAST=y
 CONFIG_NETLINK_DIAG=m
+CONFIG_MPLS=y
 CONFIG_NET_MPLS_GSO=m
+CONFIG_MPLS_ROUTING=m
 # CONFIG_WIRELESS is not set
 # CONFIG_UEVENT_HELPER is not set
 CONFIG_DEVTMPFS=y
@@ -272,6 +274,7 @@ CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_DRBD=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_PMEM=m
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
@@ -302,6 +305,7 @@ CONFIG_DM_RAID=m
 CONFIG_DM_ZERO=m
 CONFIG_DM_MULTIPATH=m
 CONFIG_DM_UEVENT=y
+CONFIG_DM_LOG_WRITES=m
 CONFIG_TARGET_CORE=m
 CONFIG_TCM_IBLOCK=m
 CONFIG_TCM_FILEIO=m
@@ -324,6 +328,7 @@ CONFIG_NETCONSOLE_DYNAMIC=y
 CONFIG_VETH=m
 CONFIG_SUN3LANCE=y
 # CONFIG_NET_VENDOR_ARC is not set
+# CONFIG_NET_CADENCE is not set
 # CONFIG_NET_VENDOR_BROADCOM is not set
 # CONFIG_NET_VENDOR_INTEL is not set
 # CONFIG_NET_VENDOR_MARVELL is not set
@@ -370,7 +375,6 @@ CONFIG_HID=m
 CONFIG_HIDRAW=y
 CONFIG_UHID=m
 # CONFIG_HID_GENERIC is not set
-# CONFIG_HID_PLANTRONICS is not set
 # CONFIG_USB_SUPPORT is not set
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_GENERIC=m
diff --git a/arch/m68k/include/asm/cmpxchg.h b/arch/m68k/include/asm/cmpxchg.h
index bc755bc620ad..83b1df80f0ac 100644
--- a/arch/m68k/include/asm/cmpxchg.h
+++ b/arch/m68k/include/asm/cmpxchg.h
@@ -90,7 +90,6 @@ extern unsigned long __invalid_cmpxchg_size(volatile void *,
  * indicated by comparing RETURN with OLD.
  */
 #ifdef CONFIG_RMW_INSNS
-#define __HAVE_ARCH_CMPXCHG	1
 
 static inline unsigned long __cmpxchg(volatile void *p, unsigned long old,
 				      unsigned long new, int size)
diff --git a/arch/m68k/include/asm/io_mm.h b/arch/m68k/include/asm/io_mm.h
index 8955b40a5dc4..618c85d3c786 100644
--- a/arch/m68k/include/asm/io_mm.h
+++ b/arch/m68k/include/asm/io_mm.h
@@ -20,6 +20,8 @@
 
 #ifdef __KERNEL__
 
+#define ARCH_HAS_IOREMAP_WT
+
 #include <linux/compiler.h>
 #include <asm/raw_io.h>
 #include <asm/virtconvert.h>
@@ -465,7 +467,7 @@ static inline void __iomem *ioremap_nocache(unsigned long physaddr, unsigned lon
 {
 	return __ioremap(physaddr, size, IOMAP_NOCACHE_SER);
 }
-static inline void __iomem *ioremap_writethrough(unsigned long physaddr,
+static inline void __iomem *ioremap_wt(unsigned long physaddr,
 					 unsigned long size)
 {
 	return __ioremap(physaddr, size, IOMAP_WRITETHROUGH);
diff --git a/arch/m68k/include/asm/io_no.h b/arch/m68k/include/asm/io_no.h
index a93c8cde4d38..ad7bd40e6742 100644
--- a/arch/m68k/include/asm/io_no.h
+++ b/arch/m68k/include/asm/io_no.h
@@ -3,6 +3,8 @@
 
 #ifdef __KERNEL__
 
+#define ARCH_HAS_IOREMAP_WT
+
 #include <asm/virtconvert.h>
 #include <asm-generic/iomap.h>
 
@@ -153,7 +155,7 @@ static inline void *ioremap_nocache(unsigned long physaddr, unsigned long size)
 {
 	return __ioremap(physaddr, size, IOMAP_NOCACHE_SER);
 }
-static inline void *ioremap_writethrough(unsigned long physaddr, unsigned long size)
+static inline void *ioremap_wt(unsigned long physaddr, unsigned long size)
 {
 	return __ioremap(physaddr, size, IOMAP_WRITETHROUGH);
 }
diff --git a/arch/m68k/include/asm/irqflags.h b/arch/m68k/include/asm/irqflags.h
index a823cd73dc09..b5941818346f 100644
--- a/arch/m68k/include/asm/irqflags.h
+++ b/arch/m68k/include/asm/irqflags.h
@@ -2,9 +2,6 @@
 #define _M68K_IRQFLAGS_H
 
 #include <linux/types.h>
-#ifdef CONFIG_MMU
-#include <linux/preempt_mask.h>
-#endif
 #include <linux/preempt.h>
 #include <asm/thread_info.h>
 #include <asm/entry.h>
diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c
index e546a5534dd4..564665f9af30 100644
--- a/arch/m68k/kernel/dma.c
+++ b/arch/m68k/kernel/dma.c
@@ -120,13 +120,16 @@ void dma_sync_single_for_device(struct device *dev, dma_addr_t handle,
 }
 EXPORT_SYMBOL(dma_sync_single_for_device);
 
-void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nents,
-			    enum dma_data_direction dir)
+void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
+			    int nents, enum dma_data_direction dir)
 {
 	int i;
+	struct scatterlist *sg;
 
-	for (i = 0; i < nents; sg++, i++)
-		dma_sync_single_for_device(dev, sg->dma_address, sg->length, dir);
+	for_each_sg(sglist, sg, nents, i) {
+		dma_sync_single_for_device(dev, sg->dma_address, sg->length,
+					   dir);
+	}
 }
 EXPORT_SYMBOL(dma_sync_sg_for_device);
 
@@ -151,14 +154,16 @@ dma_addr_t dma_map_page(struct device *dev, struct page *page,
 }
 EXPORT_SYMBOL(dma_map_page);
 
-int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
 	       enum dma_data_direction dir)
 {
 	int i;
+	struct scatterlist *sg;
 
-	for (i = 0; i < nents; sg++, i++) {
+	for_each_sg(sglist, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
-		dma_sync_single_for_device(dev, sg->dma_address, sg->length, dir);
+		dma_sync_single_for_device(dev, sg->dma_address, sg->length,
+					   dir);
 	}
 	return nents;
 }
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index b2f04aee46ec..6a94cdd0c830 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -10,10 +10,10 @@
 #include <linux/ptrace.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 
 #include <asm/setup.h>
 #include <asm/traps.h>
-#include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 
 extern void die_if_kernel(char *, struct pt_regs *, long);
@@ -81,7 +81,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/metag/include/asm/barrier.h b/arch/metag/include/asm/barrier.h
index d703d8e26a65..5a696e507930 100644
--- a/arch/metag/include/asm/barrier.h
+++ b/arch/metag/include/asm/barrier.h
@@ -84,7 +84,7 @@ static inline void fence(void)
 #define read_barrier_depends()		do { } while (0)
 #define smp_read_barrier_depends()	do { } while (0)
 
-#define set_mb(var, value) do { var = value; smp_mb(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_store_release(p, v)						\
 do {									\
diff --git a/arch/metag/include/asm/cmpxchg.h b/arch/metag/include/asm/cmpxchg.h
index b1bc1be8540f..be29e3e44321 100644
--- a/arch/metag/include/asm/cmpxchg.h
+++ b/arch/metag/include/asm/cmpxchg.h
@@ -51,8 +51,6 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 	return old;
 }
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 #define cmpxchg(ptr, o, n)						\
 	({								\
 		__typeof__(*(ptr)) _o_ = (o);				\
diff --git a/arch/metag/include/asm/io.h b/arch/metag/include/asm/io.h
index d5779b0ec573..9890f21eadbe 100644
--- a/arch/metag/include/asm/io.h
+++ b/arch/metag/include/asm/io.h
@@ -160,6 +160,9 @@ extern void __iounmap(void __iomem *addr);
 #define ioremap_wc(offset, size)                \
 	__ioremap((offset), (size), _PAGE_WR_COMBINE)
 
+#define ioremap_wt(offset, size)                \
+	__ioremap((offset), (size), 0)
+
 #define iounmap(addr)                           \
 	__iounmap(addr)
 
diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c
index 2de5dc695a87..f57edca63609 100644
--- a/arch/metag/mm/fault.c
+++ b/arch/metag/mm/fault.c
@@ -105,7 +105,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
 
 	mm = tsk->mm;
 
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/metag/mm/highmem.c b/arch/metag/mm/highmem.c
index d71f621a2c0b..807f1b1c4e65 100644
--- a/arch/metag/mm/highmem.c
+++ b/arch/metag/mm/highmem.c
@@ -43,7 +43,7 @@ void *kmap_atomic(struct page *page)
 	unsigned long vaddr;
 	int type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -82,6 +82,7 @@ void __kunmap_atomic(void *kvaddr)
 	}
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
@@ -95,6 +96,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
 	unsigned long vaddr;
 	int type;
 
+	preempt_disable();
 	pagefault_disable();
 
 	type = kmap_atomic_idx_push();
diff --git a/arch/microblaze/include/asm/io.h b/arch/microblaze/include/asm/io.h
index 940f5fc1d1da..39b6315db82e 100644
--- a/arch/microblaze/include/asm/io.h
+++ b/arch/microblaze/include/asm/io.h
@@ -39,10 +39,10 @@ extern resource_size_t isa_mem_base;
 extern void iounmap(void __iomem *addr);
 
 extern void __iomem *ioremap(phys_addr_t address, unsigned long size);
-#define ioremap_writethrough(addr, size)	ioremap((addr), (size))
 #define ioremap_nocache(addr, size)		ioremap((addr), (size))
 #define ioremap_fullcache(addr, size)		ioremap((addr), (size))
 #define ioremap_wc(addr, size)			ioremap((addr), (size))
+#define ioremap_wt(addr, size)			ioremap((addr), (size))
 
 #endif /* CONFIG_MMU */
 
diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h
index 468aca8cec0d..b42ed684b945 100644
--- a/arch/microblaze/include/asm/pci.h
+++ b/arch/microblaze/include/asm/pci.h
@@ -27,8 +27,6 @@
 #define PCIBIOS_MIN_IO		0x1000
 #define PCIBIOS_MIN_MEM		0x10000000
 
-struct pci_dev;
-
 /* Values for the `which' argument to sys_pciconfig_iobase syscall.  */
 #define IOBASE_BRIDGE_NUMBER	0
 #define IOBASE_MEMORY		1
@@ -44,16 +42,6 @@ struct pci_dev;
  */
 #define pcibios_assign_all_busses()	0
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
-#endif
-
 extern int pci_domain_nr(struct pci_bus *bus);
 
 /* Decide whether to display the domain number in /proc */
@@ -83,40 +71,12 @@ extern int pci_mmap_legacy_page_range(struct pci_bus *bus,
  */
 #define PCI_DMA_BUS_IS_PHYS     (1)
 
-static inline struct resource *pcibios_select_root(struct pci_dev *pdev,
-			struct resource *res)
-{
-	struct resource *root = NULL;
-
-	if (res->flags & IORESOURCE_IO)
-		root = &ioport_resource;
-	if (res->flags & IORESOURCE_MEM)
-		root = &iomem_resource;
-
-	return root;
-}
-
 extern void pcibios_claim_one_bus(struct pci_bus *b);
 
 extern void pcibios_finish_adding_to_bus(struct pci_bus *bus);
 
 extern void pcibios_resource_survey(void);
 
-extern struct pci_controller *init_phb_dynamic(struct device_node *dn);
-extern int remove_phb_dynamic(struct pci_controller *phb);
-
-extern struct pci_dev *of_create_pci_dev(struct device_node *node,
-					struct pci_bus *bus, int devfn);
-
-extern void of_scan_pci_bridge(struct device_node *node,
-				struct pci_dev *dev);
-
-extern void of_scan_bus(struct device_node *node, struct pci_bus *bus);
-extern void of_rescan_bus(struct device_node *node, struct pci_bus *bus);
-
-extern int pci_bus_find_capability(struct pci_bus *bus,
-						unsigned int devfn, int cap);
-
 struct file;
 extern pgprot_t	pci_phys_mem_access_prot(struct file *file,
 					 unsigned long pfn,
diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h
index 62942fd12672..331b0d35f89c 100644
--- a/arch/microblaze/include/asm/uaccess.h
+++ b/arch/microblaze/include/asm/uaccess.h
@@ -178,7 +178,8 @@ extern long __user_bad(void);
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -290,7 +291,8 @@ extern long __user_bad(void);
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
diff --git a/arch/microblaze/kernel/cpu/cpuinfo.c b/arch/microblaze/kernel/cpu/cpuinfo.c
index d1dd6e83d59b..b70bb538f001 100644
--- a/arch/microblaze/kernel/cpu/cpuinfo.c
+++ b/arch/microblaze/kernel/cpu/cpuinfo.c
@@ -47,6 +47,8 @@ const struct cpu_ver_key cpu_ver_lookup[] = {
 	{"9.1", 0x1d},
 	{"9.2", 0x1f},
 	{"9.3", 0x20},
+	{"9.4", 0x21},
+	{"9.5", 0x22},
 	{NULL, 0},
 };
 
diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c
index ed7ba8a11822..bf4dec229437 100644
--- a/arch/microblaze/kernel/dma.c
+++ b/arch/microblaze/kernel/dma.c
@@ -154,6 +154,7 @@ dma_direct_sync_sg_for_device(struct device *dev,
 			__dma_sync(sg->dma_address, sg->length, direction);
 }
 
+static
 int dma_direct_mmap_coherent(struct device *dev, struct vm_area_struct *vma,
 			     void *cpu_addr, dma_addr_t handle, size_t size,
 			     struct dma_attrs *attrs)
diff --git a/arch/microblaze/kernel/kgdb.c b/arch/microblaze/kernel/kgdb.c
index 8736af5806ae..6366f69d118e 100644
--- a/arch/microblaze/kernel/kgdb.c
+++ b/arch/microblaze/kernel/kgdb.c
@@ -32,7 +32,7 @@
 #define GDB_RTLBHI	56
 
 /* keep pvr separately because it is unchangeble */
-struct pvr_s pvr;
+static struct pvr_s pvr;
 
 void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 {
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index d46a5ebb7570..177dfc003643 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -107,14 +107,14 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
 	if ((error_code & 0x13) == 0x13 || (error_code & 0x11) == 0x11)
 		is_write = 0;
 
-	if (unlikely(in_atomic() || !mm)) {
+	if (unlikely(faulthandler_disabled() || !mm)) {
 		if (kernel_mode(regs))
 			goto bad_area_nosemaphore;
 
-		/* in_atomic() in user mode is really bad,
+		/* faulthandler_disabled() in user mode is really bad,
 		   as is current->mm == NULL. */
-		pr_emerg("Page fault in user mode with in_atomic(), mm = %p\n",
-									mm);
+		pr_emerg("Page fault in user mode with faulthandler_disabled(), mm = %p\n",
+			 mm);
 		pr_emerg("r15 = %lx  MSR = %lx\n",
 		       regs->r15, regs->msr);
 		die("Weird page fault", regs, SIGSEGV);
diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c
index 5a92576fad92..2fcc5a52d84d 100644
--- a/arch/microblaze/mm/highmem.c
+++ b/arch/microblaze/mm/highmem.c
@@ -37,7 +37,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 	unsigned long vaddr;
 	int idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -63,6 +63,7 @@ void __kunmap_atomic(void *kvaddr)
 
 	if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -84,5 +85,6 @@ void __kunmap_atomic(void *kvaddr)
 #endif
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index f5016656494f..b65edf514b40 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -819,6 +819,7 @@ config CAVIUM_OCTEON_SOC
 	select SYS_SUPPORTS_64BIT_KERNEL
 	select SYS_SUPPORTS_BIG_ENDIAN
 	select EDAC_SUPPORT
+	select EDAC_ATOMIC_SCRUB
 	select SYS_SUPPORTS_LITTLE_ENDIAN
 	select SYS_SUPPORTS_HOTPLUG_CPU if CPU_BIG_ENDIAN
 	select SYS_HAS_EARLY_PRINTK
diff --git a/arch/mips/ath79/prom.c b/arch/mips/ath79/prom.c
index e1fe63051136..597899ad5438 100644
--- a/arch/mips/ath79/prom.c
+++ b/arch/mips/ath79/prom.c
@@ -1,6 +1,7 @@
 /*
  *  Atheros AR71XX/AR724X/AR913X specific prom routines
  *
+ *  Copyright (C) 2015 Laurent Fasnacht <l@libres.ch>
  *  Copyright (C) 2008-2010 Gabor Juhos <juhosg@openwrt.org>
  *  Copyright (C) 2008 Imre Kaloz <kaloz@openwrt.org>
  *
@@ -25,12 +26,14 @@ void __init prom_init(void)
 {
 	fw_init_cmdline();
 
+#ifdef CONFIG_BLK_DEV_INITRD
 	/* Read the initrd address from the firmware environment */
 	initrd_start = fw_getenvl("initrd_start");
 	if (initrd_start) {
 		initrd_start = KSEG0ADDR(initrd_start);
 		initrd_end = initrd_start + fw_getenvl("initrd_size");
 	}
+#endif
 }
 
 void __init prom_free_prom_memory(void)
diff --git a/arch/mips/ath79/setup.c b/arch/mips/ath79/setup.c
index a73c93c3d44a..7fc8397d16f2 100644
--- a/arch/mips/ath79/setup.c
+++ b/arch/mips/ath79/setup.c
@@ -225,7 +225,7 @@ void __init plat_time_init(void)
 	ddr_clk_rate = ath79_get_sys_clk_rate("ddr");
 	ref_clk_rate = ath79_get_sys_clk_rate("ref");
 
-	pr_info("Clocks: CPU:%lu.%03luMHz, DDR:%lu.%03luMHz, AHB:%lu.%03luMHz, Ref:%lu.%03luMHz",
+	pr_info("Clocks: CPU:%lu.%03luMHz, DDR:%lu.%03luMHz, AHB:%lu.%03luMHz, Ref:%lu.%03luMHz\n",
 		cpu_clk_rate / 1000000, (cpu_clk_rate / 1000) % 1000,
 		ddr_clk_rate / 1000000, (ddr_clk_rate / 1000) % 1000,
 		ahb_clk_rate / 1000000, (ahb_clk_rate / 1000) % 1000,
diff --git a/arch/mips/cavium-octeon/crypto/octeon-md5.c b/arch/mips/cavium-octeon/crypto/octeon-md5.c
index 12dccdb38286..af4c712f7afc 100644
--- a/arch/mips/cavium-octeon/crypto/octeon-md5.c
+++ b/arch/mips/cavium-octeon/crypto/octeon-md5.c
@@ -69,10 +69,10 @@ static int octeon_md5_init(struct shash_desc *desc)
 {
 	struct md5_state *mctx = shash_desc_ctx(desc);
 
-	mctx->hash[0] = cpu_to_le32(0x67452301);
-	mctx->hash[1] = cpu_to_le32(0xefcdab89);
-	mctx->hash[2] = cpu_to_le32(0x98badcfe);
-	mctx->hash[3] = cpu_to_le32(0x10325476);
+	mctx->hash[0] = cpu_to_le32(MD5_H0);
+	mctx->hash[1] = cpu_to_le32(MD5_H1);
+	mctx->hash[2] = cpu_to_le32(MD5_H2);
+	mctx->hash[3] = cpu_to_le32(MD5_H3);
 	mctx->byte_count = 0;
 
 	return 0;
diff --git a/arch/mips/cobalt/Makefile b/arch/mips/cobalt/Makefile
index 558e94977942..68f0c5871adc 100644
--- a/arch/mips/cobalt/Makefile
+++ b/arch/mips/cobalt/Makefile
@@ -2,7 +2,6 @@
 # Makefile for the Cobalt micro systems family specific parts of the kernel
 #
 
-obj-y := buttons.o irq.o lcd.o led.o reset.o rtc.o serial.o setup.o time.o
+obj-y := buttons.o irq.o lcd.o led.o mtd.o reset.o rtc.o serial.o setup.o time.o
 
 obj-$(CONFIG_PCI)		+= pci.o
-obj-$(CONFIG_MTD_PHYSMAP)	+= mtd.o
diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig
index 002680648dcb..b2a577ebce0b 100644
--- a/arch/mips/configs/fuloong2e_defconfig
+++ b/arch/mips/configs/fuloong2e_defconfig
@@ -194,7 +194,7 @@ CONFIG_USB_WUSB_CBAF=m
 CONFIG_USB_C67X00_HCD=m
 CONFIG_USB_EHCI_HCD=y
 CONFIG_USB_EHCI_ROOT_HUB_TT=y
-CONFIG_USB_ISP1760_HCD=m
+CONFIG_USB_ISP1760=m
 CONFIG_USB_OHCI_HCD=y
 CONFIG_USB_UHCI_HCD=m
 CONFIG_USB_R8A66597_HCD=m
diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h
index 2b8bbbcb9be0..7ecba84656d4 100644
--- a/arch/mips/include/asm/barrier.h
+++ b/arch/mips/include/asm/barrier.h
@@ -112,8 +112,8 @@
 #define __WEAK_LLSC_MB		"		\n"
 #endif
 
-#define set_mb(var, value) \
-	do { var = value; smp_mb(); } while (0)
+#define smp_store_mb(var, value) \
+	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_llsc_mb()	__asm__ __volatile__(__WEAK_LLSC_MB : : :"memory")
 
diff --git a/arch/mips/include/asm/cmpxchg.h b/arch/mips/include/asm/cmpxchg.h
index 412f945f1f5e..b71ab4a5fd50 100644
--- a/arch/mips/include/asm/cmpxchg.h
+++ b/arch/mips/include/asm/cmpxchg.h
@@ -138,8 +138,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
 		__xchg((unsigned long)(x), (ptr), sizeof(*(ptr))));	\
 })
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 #define __cmpxchg_asm(ld, st, m, old, new)				\
 ({									\
 	__typeof(*(m)) __ret;						\
diff --git a/arch/mips/include/asm/edac.h b/arch/mips/include/asm/edac.h
index 94105d3f58f4..980b16527374 100644
--- a/arch/mips/include/asm/edac.h
+++ b/arch/mips/include/asm/edac.h
@@ -5,7 +5,7 @@
 
 /* ECC atomic, DMA, SMP and interrupt safe scrub function */
 
-static inline void atomic_scrub(void *va, u32 size)
+static inline void edac_atomic_scrub(void *va, u32 size)
 {
 	unsigned long *virt_addr = va;
 	unsigned long temp;
@@ -21,7 +21,7 @@ static inline void atomic_scrub(void *va, u32 size)
 
 		__asm__ __volatile__ (
 		"	.set	mips2					\n"
-		"1:	ll	%0, %1		# atomic_scrub		\n"
+		"1:	ll	%0, %1		# edac_atomic_scrub	\n"
 		"	addu	%0, $0					\n"
 		"	sc	%0, %1					\n"
 		"	beqz	%0, 1b					\n"
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 4c25823563fe..e8c8d9d0c45f 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -839,7 +839,7 @@ static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_free_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
-static inline void kvm_arch_memslots_updated(struct kvm *kvm) {}
+static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *slot) {}
diff --git a/arch/mips/include/asm/mach-ath79/ath79_spi_platform.h b/arch/mips/include/asm/mach-ath79/ath79_spi_platform.h
index aa2283e602fc..aa71216edf99 100644
--- a/arch/mips/include/asm/mach-ath79/ath79_spi_platform.h
+++ b/arch/mips/include/asm/mach-ath79/ath79_spi_platform.h
@@ -16,8 +16,4 @@ struct ath79_spi_platform_data {
 	unsigned	num_chipselect;
 };
 
-struct ath79_spi_controller_data {
-	unsigned	gpio;
-};
-
 #endif /* _ATH79_SPI_PLATFORM_H */
diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h
index d9692993fc83..70dcc5498128 100644
--- a/arch/mips/include/asm/pci.h
+++ b/arch/mips/include/asm/pci.h
@@ -113,16 +113,6 @@ struct pci_dev;
  */
 extern unsigned int PCI_DMA_BUS_IS_PHYS;
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
-#endif
-
 #ifdef CONFIG_PCI_DOMAINS
 #define pci_domain_nr(bus) ((struct pci_controller *)(bus)->sysdata)->index
 
diff --git a/arch/mips/include/asm/pgtable-bits.h b/arch/mips/include/asm/pgtable-bits.h
index 18ae5ddef118..c28a8499aec7 100644
--- a/arch/mips/include/asm/pgtable-bits.h
+++ b/arch/mips/include/asm/pgtable-bits.h
@@ -113,7 +113,7 @@
 #define _PAGE_PRESENT_SHIFT	0
 #define _PAGE_PRESENT		(1 << _PAGE_PRESENT_SHIFT)
 /* R2 or later cores check for RI/XI support to determine _PAGE_READ */
-#ifdef CONFIG_CPU_MIPSR2
+#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
 #define _PAGE_WRITE_SHIFT	(_PAGE_PRESENT_SHIFT + 1)
 #define _PAGE_WRITE		(1 << _PAGE_WRITE_SHIFT)
 #else
@@ -135,16 +135,16 @@
 #define _PAGE_SPLITTING		(1 << _PAGE_SPLITTING_SHIFT)
 
 /* Only R2 or newer cores have the XI bit */
-#ifdef CONFIG_CPU_MIPSR2
+#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
 #define _PAGE_NO_EXEC_SHIFT	(_PAGE_SPLITTING_SHIFT + 1)
 #else
 #define _PAGE_GLOBAL_SHIFT	(_PAGE_SPLITTING_SHIFT + 1)
 #define _PAGE_GLOBAL		(1 << _PAGE_GLOBAL_SHIFT)
-#endif	/* CONFIG_CPU_MIPSR2 */
+#endif	/* CONFIG_CPU_MIPSR2 || CONFIG_CPU_MIPSR6 */
 
 #endif	/* CONFIG_64BIT && CONFIG_MIPS_HUGE_TLB_SUPPORT */
 
-#ifdef CONFIG_CPU_MIPSR2
+#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
 /* XI - page cannot be executed */
 #ifndef _PAGE_NO_EXEC_SHIFT
 #define _PAGE_NO_EXEC_SHIFT	(_PAGE_MODIFIED_SHIFT + 1)
@@ -160,10 +160,10 @@
 #define _PAGE_GLOBAL_SHIFT	(_PAGE_NO_READ_SHIFT + 1)
 #define _PAGE_GLOBAL		(1 << _PAGE_GLOBAL_SHIFT)
 
-#else	/* !CONFIG_CPU_MIPSR2 */
+#else	/* !CONFIG_CPU_MIPSR2 && !CONFIG_CPU_MIPSR6 */
 #define _PAGE_GLOBAL_SHIFT	(_PAGE_MODIFIED_SHIFT + 1)
 #define _PAGE_GLOBAL		(1 << _PAGE_GLOBAL_SHIFT)
-#endif	/* CONFIG_CPU_MIPSR2 */
+#endif	/* CONFIG_CPU_MIPSR2 || CONFIG_CPU_MIPSR6 */
 
 #define _PAGE_VALID_SHIFT	(_PAGE_GLOBAL_SHIFT + 1)
 #define _PAGE_VALID		(1 << _PAGE_VALID_SHIFT)
@@ -205,7 +205,7 @@
  */
 static inline uint64_t pte_to_entrylo(unsigned long pte_val)
 {
-#ifdef CONFIG_CPU_MIPSR2
+#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
 	if (cpu_has_rixi) {
 		int sa;
 #ifdef CONFIG_32BIT
diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h
index e92d6c4b5ed1..7163cd7fdd69 100644
--- a/arch/mips/include/asm/switch_to.h
+++ b/arch/mips/include/asm/switch_to.h
@@ -104,7 +104,6 @@ do {									\
 	if (test_and_clear_tsk_thread_flag(prev, TIF_USEDMSA))		\
 		__fpsave = FP_SAVE_VECTOR;				\
 	(last) = resume(prev, next, task_thread_info(next), __fpsave);	\
-	disable_msa();							\
 } while (0)
 
 #define finish_arch_switch(prev)					\
@@ -122,6 +121,7 @@ do {									\
 	if (cpu_has_userlocal)						\
 		write_c0_userlocal(current_thread_info()->tp_value);	\
 	__restore_watch();						\
+	disable_msa();							\
 } while (0)
 
 #endif /* _ASM_SWITCH_TO_H */
diff --git a/arch/mips/include/asm/topology.h b/arch/mips/include/asm/topology.h
index 3e307ec2afba..7afda4150a59 100644
--- a/arch/mips/include/asm/topology.h
+++ b/arch/mips/include/asm/topology.h
@@ -15,7 +15,7 @@
 #define topology_physical_package_id(cpu)	(cpu_data[cpu].package)
 #define topology_core_id(cpu)			(cpu_data[cpu].core)
 #define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
-#define topology_thread_cpumask(cpu)		(&cpu_sibling_map[cpu])
+#define topology_sibling_cpumask(cpu)		(&cpu_sibling_map[cpu])
 #endif
 
 #endif /* __ASM_TOPOLOGY_H */
diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h
index bf8b32450ef6..9722357d2854 100644
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -103,7 +103,8 @@ extern u64 __ua_limit;
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Checks if a pointer to a block of memory in user space is valid.
  *
@@ -138,7 +139,8 @@ extern u64 __ua_limit;
  * @x:	 Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -157,7 +159,8 @@ extern u64 __ua_limit;
  * @x:	 Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -177,7 +180,8 @@ extern u64 __ua_limit;
  * @x:	 Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -199,7 +203,8 @@ extern u64 __ua_limit;
  * @x:	 Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -498,7 +503,8 @@ extern void __put_user_unknown(void);
  * @x:	 Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -517,7 +523,8 @@ extern void __put_user_unknown(void);
  * @x:	 Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -537,7 +544,8 @@ extern void __put_user_unknown(void);
  * @x:	 Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -559,7 +567,8 @@ extern void __put_user_unknown(void);
  * @x:	 Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -815,7 +824,8 @@ extern size_t __copy_user(void *__to, const void *__from, size_t __n);
  * @from: Source address, in kernel space.
  * @n:	  Number of bytes to copy.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -888,7 +898,8 @@ extern size_t __copy_user_inatomic(void *__to, const void *__from, size_t __n);
  * @from: Source address, in kernel space.
  * @n:	  Number of bytes to copy.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.
  *
@@ -1075,7 +1086,8 @@ extern size_t __copy_in_user_eva(void *__to, const void *__from, size_t __n);
  * @from: Source address, in user space.
  * @n:	  Number of bytes to copy.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -1107,7 +1119,8 @@ extern size_t __copy_in_user_eva(void *__to, const void *__from, size_t __n);
  * @from: Source address, in user space.
  * @n:	  Number of bytes to copy.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.
  *
@@ -1329,7 +1342,8 @@ strncpy_from_user(char *__to, const char __user *__from, long __len)
  * strlen_user: - Get the size of a string in user space.
  * @str: The string to measure.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Get the size of a NUL-terminated string in user space.
  *
@@ -1398,7 +1412,8 @@ static inline long __strnlen_user(const char __user *s, long n)
  * strnlen_user: - Get the size of a string in user space.
  * @str: The string to measure.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Get the size of a NUL-terminated string in user space.
  *
diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c
index e36515dcd3b2..209e5b76c1bc 100644
--- a/arch/mips/kernel/cpu-probe.c
+++ b/arch/mips/kernel/cpu-probe.c
@@ -74,13 +74,12 @@ static inline void cpu_set_fpu_fcsr_mask(struct cpuinfo_mips *c)
 {
 	unsigned long sr, mask, fcsr, fcsr0, fcsr1;
 
+	fcsr = c->fpu_csr31;
 	mask = FPU_CSR_ALL_X | FPU_CSR_ALL_E | FPU_CSR_ALL_S | FPU_CSR_RM;
 
 	sr = read_c0_status();
 	__enable_fpu(FPU_AS_IS);
 
-	fcsr = read_32bit_cp1_register(CP1_STATUS);
-
 	fcsr0 = fcsr & mask;
 	write_32bit_cp1_register(CP1_STATUS, fcsr0);
 	fcsr0 = read_32bit_cp1_register(CP1_STATUS);
diff --git a/arch/mips/kernel/irq.c b/arch/mips/kernel/irq.c
index d2bfbc2e8995..3c8a18a00a65 100644
--- a/arch/mips/kernel/irq.c
+++ b/arch/mips/kernel/irq.c
@@ -29,7 +29,7 @@
 int kgdb_early_setup;
 #endif
 
-static unsigned long irq_map[NR_IRQS / BITS_PER_LONG];
+static DECLARE_BITMAP(irq_map, NR_IRQS);
 
 int allocate_irqno(void)
 {
@@ -109,7 +109,7 @@ void __init init_IRQ(void)
 #endif
 }
 
-#ifdef DEBUG_STACKOVERFLOW
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
 static inline void check_stack_overflow(void)
 {
 	unsigned long sp;
diff --git a/arch/mips/kernel/signal-common.h b/arch/mips/kernel/signal-common.h
index 06805e09bcd3..0b85f827cd18 100644
--- a/arch/mips/kernel/signal-common.h
+++ b/arch/mips/kernel/signal-common.h
@@ -28,12 +28,7 @@ extern void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
 extern int fpcsr_pending(unsigned int __user *fpcsr);
 
 /* Make sure we will not lose FPU ownership */
-#ifdef CONFIG_PREEMPT
-#define lock_fpu_owner()	preempt_disable()
-#define unlock_fpu_owner()	preempt_enable()
-#else
-#define lock_fpu_owner()	pagefault_disable()
-#define unlock_fpu_owner()	pagefault_enable()
-#endif
+#define lock_fpu_owner()	({ preempt_disable(); pagefault_disable(); })
+#define unlock_fpu_owner()	({ pagefault_enable(); preempt_enable(); })
 
 #endif	/* __SIGNAL_COMMON_H */
diff --git a/arch/mips/kernel/smp-bmips.c b/arch/mips/kernel/smp-bmips.c
index fd528d7ea278..336708ae5c5b 100644
--- a/arch/mips/kernel/smp-bmips.c
+++ b/arch/mips/kernel/smp-bmips.c
@@ -444,7 +444,7 @@ struct plat_smp_ops bmips5000_smp_ops = {
 static void bmips_wr_vec(unsigned long dst, char *start, char *end)
 {
 	memcpy((void *)dst, start, end - start);
-	dma_cache_wback((unsigned long)start, end - start);
+	dma_cache_wback(dst, end - start);
 	local_flush_icache_range(dst, dst + (end - start));
 	instruction_hazard();
 }
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 4b50c5787e25..d5fa3eaf39a1 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -2409,7 +2409,7 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
 		if (vcpu->mmio_needed == 2)
 			*gpr = *(int16_t *) run->mmio.data;
 		else
-			*gpr = *(int16_t *) run->mmio.data;
+			*gpr = *(uint16_t *)run->mmio.data;
 
 		break;
 	case 1:
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index bb68e8d520e8..cd4c129ce743 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -198,15 +198,16 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
-				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_userspace_memory_region *mem,
 				   enum kvm_mr_change change)
 {
 	return 0;
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_userspace_memory_region *mem,
 				   const struct kvm_memory_slot *old,
+				   const struct kvm_memory_slot *new,
 				   enum kvm_mr_change change)
 {
 	unsigned long npages = 0;
@@ -393,7 +394,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	kvm_mips_deliver_interrupts(vcpu,
 				    kvm_read_c0_guest_cause(vcpu->arch.cop0));
 
-	kvm_guest_enter();
+	__kvm_guest_enter();
 
 	/* Disable hardware page table walking while in guest */
 	htw_stop();
@@ -403,7 +404,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	/* Re-enable HTW before enabling interrupts */
 	htw_start();
 
-	kvm_guest_exit();
+	__kvm_guest_exit();
 	local_irq_enable();
 
 	if (vcpu->sigset_active)
@@ -968,6 +969,7 @@ out:
 /* Get (and clear) the dirty memory log for a memory slot. */
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 {
+	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
 	unsigned long ga, ga_end;
 	int is_dirty = 0;
@@ -982,7 +984,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
 
 	/* If nothing is dirty, don't bother messing with page tables. */
 	if (is_dirty) {
-		memslot = &kvm->memslots->memslots[log->slot];
+		slots = kvm_memslots(kvm);
+		memslot = id_to_memslot(slots, log->slot);
 
 		ga = memslot->base_gfn << PAGE_SHIFT;
 		ga_end = ga + (memslot->npages << PAGE_SHIFT);
diff --git a/arch/mips/lib/strnlen_user.S b/arch/mips/lib/strnlen_user.S
index 7d12c0dded3d..77e64942f004 100644
--- a/arch/mips/lib/strnlen_user.S
+++ b/arch/mips/lib/strnlen_user.S
@@ -34,7 +34,12 @@ LEAF(__strnlen_\func\()_asm)
 FEXPORT(__strnlen_\func\()_nocheck_asm)
 	move		v0, a0
 	PTR_ADDU	a1, a0			# stop pointer
-1:	beq		v0, a1, 1f		# limit reached?
+1:
+#ifdef CONFIG_CPU_DADDI_WORKAROUNDS
+	.set		noat
+	li		AT, 1
+#endif
+	beq		v0, a1, 1f		# limit reached?
 .ifeqs "\func", "kernel"
 	EX(lb, t0, (v0), .Lfault\@)
 .else
@@ -42,7 +47,13 @@ FEXPORT(__strnlen_\func\()_nocheck_asm)
 .endif
 	.set		noreorder
 	bnez		t0, 1b
-1:	 PTR_ADDIU	v0, 1
+1:
+#ifndef CONFIG_CPU_DADDI_WORKAROUNDS
+	 PTR_ADDIU	v0, 1
+#else
+	 PTR_ADDU	v0, AT
+	.set		at
+#endif
 	.set		reorder
 	PTR_SUBU	v0, a0
 	jr		ra
diff --git a/arch/mips/loongson/common/Makefile b/arch/mips/loongson/common/Makefile
index e70c33fdb881..f2e8153e44f5 100644
--- a/arch/mips/loongson/common/Makefile
+++ b/arch/mips/loongson/common/Makefile
@@ -3,15 +3,13 @@
 #
 
 obj-y += setup.o init.o cmdline.o env.o time.o reset.o irq.o \
-    bonito-irq.o mem.o machtype.o platform.o
+    bonito-irq.o mem.o machtype.o platform.o serial.o
 obj-$(CONFIG_PCI) += pci.o
 
 #
 # Serial port support
 #
 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
-loongson-serial-$(CONFIG_SERIAL_8250) := serial.o
-obj-y += $(loongson-serial-m) $(loongson-serial-y)
 obj-$(CONFIG_LOONGSON_UART_BASE) += uart_base.o
 obj-$(CONFIG_LOONGSON_MC146818) += rtc.o
 
diff --git a/arch/mips/loongson/loongson-3/smp.c b/arch/mips/loongson/loongson-3/smp.c
index e3c68b5da18d..509877c6e9d9 100644
--- a/arch/mips/loongson/loongson-3/smp.c
+++ b/arch/mips/loongson/loongson-3/smp.c
@@ -272,7 +272,7 @@ void loongson3_ipi_interrupt(struct pt_regs *regs)
 	if (action & SMP_ASK_C0COUNT) {
 		BUG_ON(cpu != 0);
 		c0count = read_c0_count();
-		for (i = 1; i < loongson_sysconf.nr_cpus; i++)
+		for (i = 1; i < num_possible_cpus(); i++)
 			per_cpu(core0_c0count, i) = c0count;
 	}
 }
diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c
index 0dbb65a51ce5..2e03ab173591 100644
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -1372,7 +1372,7 @@ static int probe_scache(void)
 	scache_size = addr;
 	c->scache.linesz = 16 << ((config & R4K_CONF_SB) >> 22);
 	c->scache.ways = 1;
-	c->dcache.waybit = 0;		/* does not matter */
+	c->scache.waybit = 0;		/* does not matter */
 
 	return 1;
 }
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 7ff8637e530d..36c0f26fac6b 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -21,10 +21,10 @@
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include <linux/perf_event.h>
+#include <linux/uaccess.h>
 
 #include <asm/branch.h>
 #include <asm/mmu_context.h>
-#include <asm/uaccess.h>
 #include <asm/ptrace.h>
 #include <asm/highmem.h>		/* For VMALLOC_END */
 #include <linux/kdebug.h>
@@ -94,7 +94,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto bad_area_nosemaphore;
 
 	if (user_mode(regs))
diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c
index da815d295239..11661cbc11a8 100644
--- a/arch/mips/mm/highmem.c
+++ b/arch/mips/mm/highmem.c
@@ -47,7 +47,7 @@ void *kmap_atomic(struct page *page)
 	unsigned long vaddr;
 	int idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -72,6 +72,7 @@ void __kunmap_atomic(void *kvaddr)
 
 	if (vaddr < FIXADDR_START) { // FIXME
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -92,6 +93,7 @@ void __kunmap_atomic(void *kvaddr)
 #endif
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
@@ -104,6 +106,7 @@ void *kmap_atomic_pfn(unsigned long pfn)
 	unsigned long vaddr;
 	int idx, type;
 
+	preempt_disable();
 	pagefault_disable();
 
 	type = kmap_atomic_idx_push();
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index faa5c9822ecc..198a3147dd7d 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -90,6 +90,7 @@ static void *__kmap_pgprot(struct page *page, unsigned long addr, pgprot_t prot)
 
 	BUG_ON(Page_dcache_dirty(page));
 
+	preempt_disable();
 	pagefault_disable();
 	idx = (addr >> PAGE_SHIFT) & (FIX_N_COLOURS - 1);
 	idx += in_interrupt() ? FIX_N_COLOURS : 0;
@@ -152,6 +153,7 @@ void kunmap_coherent(void)
 	write_c0_entryhi(old_ctx);
 	local_irq_restore(flags);
 	pagefault_enable();
+	preempt_enable();
 }
 
 void copy_user_highpage(struct page *to, struct page *from,
diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c
index 5d6139390bf8..e23fdf2a9c80 100644
--- a/arch/mips/net/bpf_jit.c
+++ b/arch/mips/net/bpf_jit.c
@@ -681,11 +681,7 @@ static unsigned int get_stack_depth(struct jit_ctx *ctx)
 		sp_off += config_enabled(CONFIG_64BIT) ?
 			(ARGS_USED_BY_JIT + 1) * RSIZE : RSIZE;
 
-	/*
-	 * Subtract the bytes for the last registers since we only care about
-	 * the location on the stack pointer.
-	 */
-	return sp_off - RSIZE;
+	return sp_off;
 }
 
 static void build_prologue(struct jit_ctx *ctx)
diff --git a/arch/mips/pci/fixup-cobalt.c b/arch/mips/pci/fixup-cobalt.c
index a138e8ee5cfc..b3ab59318d91 100644
--- a/arch/mips/pci/fixup-cobalt.c
+++ b/arch/mips/pci/fixup-cobalt.c
@@ -13,7 +13,6 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 
-#include <asm/pci.h>
 #include <asm/io.h>
 #include <asm/gt64120.h>
 
diff --git a/arch/mips/pci/ops-mace.c b/arch/mips/pci/ops-mace.c
index 6b5821febc38..951d8070fb48 100644
--- a/arch/mips/pci/ops-mace.c
+++ b/arch/mips/pci/ops-mace.c
@@ -8,7 +8,6 @@
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/types.h>
-#include <asm/pci.h>
 #include <asm/ip32/mace.h>
 
 #if 0
diff --git a/arch/mips/pci/pci-lantiq.c b/arch/mips/pci/pci-lantiq.c
index 8b117e638306..c5347d99cf3a 100644
--- a/arch/mips/pci/pci-lantiq.c
+++ b/arch/mips/pci/pci-lantiq.c
@@ -20,7 +20,6 @@
 #include <linux/of_irq.h>
 #include <linux/of_pci.h>
 
-#include <asm/pci.h>
 #include <asm/gpio.h>
 #include <asm/addrspace.h>
 
diff --git a/arch/mips/ralink/ill_acc.c b/arch/mips/ralink/ill_acc.c
index e20b02e3ae28..e10d10b9e82a 100644
--- a/arch/mips/ralink/ill_acc.c
+++ b/arch/mips/ralink/ill_acc.c
@@ -41,7 +41,7 @@ static irqreturn_t ill_acc_irq_handler(int irq, void *_priv)
 		addr, (type >> ILL_ACC_OFF_S) & ILL_ACC_OFF_M,
 		type & ILL_ACC_LEN_M);
 
-	rt_memc_w32(REG_ILL_ACC_TYPE, REG_ILL_ACC_TYPE);
+	rt_memc_w32(ILL_INT_STATUS, REG_ILL_ACC_TYPE);
 
 	return IRQ_HANDLED;
 }
diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h
index 2fbbe4d920aa..1ddea5afba09 100644
--- a/arch/mn10300/include/asm/highmem.h
+++ b/arch/mn10300/include/asm/highmem.h
@@ -75,6 +75,7 @@ static inline void *kmap_atomic(struct page *page)
 	unsigned long vaddr;
 	int idx, type;
 
+	preempt_disable();
 	pagefault_disable();
 	if (page < highmem_start_page)
 		return page_address(page);
@@ -98,6 +99,7 @@ static inline void __kunmap_atomic(unsigned long vaddr)
 
 	if (vaddr < FIXADDR_START) { /* FIXME */
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -122,6 +124,7 @@ static inline void __kunmap_atomic(unsigned long vaddr)
 
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 #endif /* __KERNEL__ */
 
diff --git a/arch/mn10300/include/asm/io.h b/arch/mn10300/include/asm/io.h
index cc4a2ba9e228..07c5b4a3903b 100644
--- a/arch/mn10300/include/asm/io.h
+++ b/arch/mn10300/include/asm/io.h
@@ -282,6 +282,7 @@ static inline void __iomem *ioremap_nocache(unsigned long offset, unsigned long
 }
 
 #define ioremap_wc ioremap_nocache
+#define ioremap_wt ioremap_nocache
 
 static inline void iounmap(void __iomem *addr)
 {
diff --git a/arch/mn10300/include/asm/pci.h b/arch/mn10300/include/asm/pci.h
index 5f70af25c7d0..c222d1792d5b 100644
--- a/arch/mn10300/include/asm/pci.h
+++ b/arch/mn10300/include/asm/pci.h
@@ -83,19 +83,6 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 /* implement the pci_ DMA API in terms of the generic device dma_ one */
 #include <asm-generic/pci-dma-compat.h>
 
-static inline struct resource *
-pcibios_select_root(struct pci_dev *pdev, struct resource *res)
-{
-	struct resource *root = NULL;
-
-	if (res->flags & IORESOURCE_IO)
-		root = &ioport_resource;
-	if (res->flags & IORESOURCE_MEM)
-		root = &iomem_resource;
-
-	return root;
-}
-
 static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
 {
 	return channel ? 15 : 14;
diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c
index 0c2cc5d39c8e..4a1d181ed32f 100644
--- a/arch/mn10300/mm/fault.c
+++ b/arch/mn10300/mm/fault.c
@@ -23,8 +23,8 @@
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/vt_kern.h>		/* For unblank_screen() */
+#include <linux/uaccess.h>
 
-#include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 #include <asm/hardirq.h>
 #include <asm/cpu-regs.h>
@@ -168,7 +168,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
diff --git a/arch/nios2/include/asm/io.h b/arch/nios2/include/asm/io.h
index 6e24d7cceb0c..c5a62da22cd2 100644
--- a/arch/nios2/include/asm/io.h
+++ b/arch/nios2/include/asm/io.h
@@ -46,6 +46,7 @@ static inline void iounmap(void __iomem *addr)
 }
 
 #define ioremap_wc ioremap_nocache
+#define ioremap_wt ioremap_nocache
 
 /* Pages to physical address... */
 #define page_to_phys(page)	virt_to_phys(page_to_virt(page))
diff --git a/arch/nios2/kernel/time.c b/arch/nios2/kernel/time.c
index 7f4547418ee1..be186a75f622 100644
--- a/arch/nios2/kernel/time.c
+++ b/arch/nios2/kernel/time.c
@@ -8,6 +8,7 @@
  * for more details.
  */
 
+#include <linux/export.h>
 #include <linux/interrupt.h>
 #include <linux/clockchips.h>
 #include <linux/clocksource.h>
@@ -106,6 +107,7 @@ cycles_t get_cycles(void)
 {
 	return nios2_timer_read(&nios2_cs.cs);
 }
+EXPORT_SYMBOL(get_cycles);
 
 static void nios2_timer_start(struct nios2_timer *timer)
 {
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c
index 0c9b6afe69e9..b51878b0c6b8 100644
--- a/arch/nios2/mm/fault.c
+++ b/arch/nios2/mm/fault.c
@@ -77,7 +77,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long cause,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto bad_area_nosemaphore;
 
 	if (user_mode(regs))
diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h
index de65f66ea64e..ec2df4bab302 100644
--- a/arch/parisc/include/asm/cacheflush.h
+++ b/arch/parisc/include/asm/cacheflush.h
@@ -142,6 +142,7 @@ static inline void kunmap(struct page *page)
 
 static inline void *kmap_atomic(struct page *page)
 {
+	preempt_disable();
 	pagefault_disable();
 	return page_address(page);
 }
@@ -150,6 +151,7 @@ static inline void __kunmap_atomic(void *addr)
 {
 	flush_kernel_dcache_page_addr(addr);
 	pagefault_enable();
+	preempt_enable();
 }
 
 #define kmap_atomic_prot(page, prot)	kmap_atomic(page)
diff --git a/arch/parisc/include/asm/cmpxchg.h b/arch/parisc/include/asm/cmpxchg.h
index dbd13354ec41..0a90b965cccb 100644
--- a/arch/parisc/include/asm/cmpxchg.h
+++ b/arch/parisc/include/asm/cmpxchg.h
@@ -46,8 +46,6 @@ __xchg(unsigned long x, __volatile__ void *ptr, int size)
 #define xchg(ptr, x) \
 	((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr))))
 
-#define __HAVE_ARCH_CMPXCHG	1
-
 /* bug catcher for when unsupported size is used - won't link */
 extern void __cmpxchg_called_with_bad_pointer(void);
 
diff --git a/arch/parisc/include/asm/pci.h b/arch/parisc/include/asm/pci.h
index 20df2b04fc09..bf5e044281d6 100644
--- a/arch/parisc/include/asm/pci.h
+++ b/arch/parisc/include/asm/pci.h
@@ -196,25 +196,6 @@ static inline void pcibios_register_hba(struct pci_hba_data *x)
 /* export the pci_ DMA API in terms of the dma_ one */
 #include <asm-generic/pci-dma-compat.h>
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	unsigned long cacheline_size;
-	u8 byte;
-
-	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
-	if (byte == 0)
-		cacheline_size = 1024;
-	else
-		cacheline_size = (int) byte * 4;
-
-	*strat = PCI_DMA_BURST_MULTIPLE;
-	*strategy_parameter = cacheline_size;
-}
-#endif
-
 static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
 {
 	return channel ? 15 : 14;
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index 47ee620d15d2..6548fd1d2e62 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -26,9 +26,9 @@
 #include <linux/console.h>
 #include <linux/bug.h>
 #include <linux/ratelimit.h>
+#include <linux/uaccess.h>
 
 #include <asm/assembly.h>
-#include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/traps.h>
@@ -800,7 +800,7 @@ void notrace handle_interruption(int code, struct pt_regs *regs)
 	     * unless pagefault_disable() was called before.
 	     */
 
-	    if (fault_space == 0 && !in_atomic())
+	    if (fault_space == 0 && !faulthandler_disabled())
 	    {
 		pdc_chassis_send_status(PDC_CHASSIS_DIRECT_PANIC);
 		parisc_terminate("Kernel Fault", regs, code, fault_address);
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index e5120e653240..15503adddf4f 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -15,8 +15,8 @@
 #include <linux/sched.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 
-#include <asm/uaccess.h>
 #include <asm/traps.h>
 
 /* Various important other fields */
@@ -207,7 +207,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
 	int fault;
 	unsigned int flags;
 
-	if (in_atomic())
+	if (pagefault_disabled())
 		goto no_context;
 
 	tsk = current;
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 190cc48abc0c..5ef27113b898 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -153,6 +153,8 @@ config PPC
 	select NO_BOOTMEM
 	select HAVE_GENERIC_RCU_GUP
 	select HAVE_PERF_EVENTS_NMI if PPC64
+	select EDAC_SUPPORT
+	select EDAC_ATOMIC_SCRUB
 
 config GENERIC_CSUM
 	def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 0efa8f90a8f1..3a510f4a6b68 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -19,6 +19,14 @@ config PPC_WERROR
 	depends on !PPC_DISABLE_WERROR
 	default y
 
+config STRICT_MM_TYPECHECKS
+	bool "Do extra type checking on mm types"
+	default n
+	help
+	  This option turns on extra type checking for some mm related types.
+
+	  If you don't know what this means, say N.
+
 config PRINT_STACK_DEPTH
 	int "Stack depth to print" if DEBUG_KERNEL
 	default 64
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 07a480861f78..05f464eb6952 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -66,7 +66,10 @@ endif
 UTS_MACHINE := $(OLDARCH)
 
 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
-override CC	+= -mlittle-endian -mno-strict-align
+override CC	+= -mlittle-endian
+ifneq ($(COMPILER),clang)
+override CC	+= -mno-strict-align
+endif
 override AS	+= -mlittle-endian
 override LD	+= -EL
 override CROSS32CC += -mlittle-endian
@@ -113,14 +116,14 @@ else
 endif
 endif
 
-CFLAGS-$(CONFIG_PPC64)	:= -mtraceback=no
+CFLAGS-$(CONFIG_PPC64)	:= $(call cc-option,-mtraceback=no)
 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
-CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mabi=elfv2,-mcall-aixdesc)
+CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mabi=elfv2,$(call cc-option,-mcall-aixdesc))
 AFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mabi=elfv2)
 else
-CFLAGS-$(CONFIG_PPC64)	+= -mcall-aixdesc
+CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mcall-aixdesc)
 endif
-CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mcmodel=medium,-mminimal-toc)
+CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mcmodel=medium,$(call cc-option,-mminimal-toc))
 CFLAGS-$(CONFIG_PPC64)	+= $(call cc-option,-mno-pointers-to-nested-functions)
 CFLAGS-$(CONFIG_PPC32)	:= -ffixed-r2 $(MULTIPLEWORD)
 
@@ -160,7 +163,8 @@ asinstr := $(call as-instr,lis 9$(comma)foo@high,-DHAVE_AS_ATHIGH=1)
 
 KBUILD_CPPFLAGS	+= -Iarch/$(ARCH) $(asinstr)
 KBUILD_AFLAGS	+= -Iarch/$(ARCH) $(AFLAGS-y)
-KBUILD_CFLAGS	+= -msoft-float -pipe -Iarch/$(ARCH) $(CFLAGS-y)
+KBUILD_CFLAGS	+= $(call cc-option,-msoft-float)
+KBUILD_CFLAGS	+= -pipe -Iarch/$(ARCH) $(CFLAGS-y)
 CPP		= $(CC) -E $(KBUILD_CFLAGS)
 
 CHECKFLAGS	+= -m$(CONFIG_WORD_SIZE) -D__powerpc__ -D__powerpc$(CONFIG_WORD_SIZE)__
@@ -192,7 +196,7 @@ KBUILD_CFLAGS	+= $(call cc-option,-fno-dwarf2-cfi-asm)
 
 # Never use string load/store instructions as they are
 # often slow when they are implemented at all
-KBUILD_CFLAGS		+= -mno-string
+KBUILD_CFLAGS		+= $(call cc-option,-mno-string)
 
 ifeq ($(CONFIG_6xx),y)
 KBUILD_CFLAGS		+= -mcpu=powerpc
@@ -269,6 +273,21 @@ bootwrapper_install:
 %.dtb: scripts
 	$(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
 
+# Used to create 'merged defconfigs'
+# To use it $(call) it with the first argument as the base defconfig
+# and the second argument as a space separated list of .config files to merge,
+# without the .config suffix.
+define merge_into_defconfig
+	$(Q)$(CONFIG_SHELL) $(srctree)/scripts/kconfig/merge_config.sh \
+		-m -O $(objtree) $(srctree)/arch/$(ARCH)/configs/$(1) \
+		$(foreach config,$(2),$(srctree)/arch/$(ARCH)/configs/$(config).config)
+	+$(Q)$(MAKE) -f $(srctree)/Makefile olddefconfig
+endef
+
+PHONY += pseries_le_defconfig
+pseries_le_defconfig:
+	$(call merge_into_defconfig,pseries_defconfig,le)
+
 define archhelp
   @echo '* zImage          - Build default images selected by kernel config'
   @echo '  zImage.*        - Compressed kernel image (arch/$(ARCH)/boot/zImage.*)'
@@ -314,7 +333,8 @@ TOUT	:= .tmp_gas_check
 # - Require gcc 4.0 or above on 64-bit
 # - gcc-4.2.0 has issues compiling modules on 64-bit
 checkbin:
-	@if test "$(cc-version)" = "0304" ; then \
+	@if test "${COMPILER}" != "clang" \
+	    && test "$(cc-version)" = "0304" ; then \
 		if ! /bin/echo mftb 5 | $(AS) -v -mppc -many -o $(TOUT) >/dev/null 2>&1 ; then \
 			echo -n '*** ${VERSION}.${PATCHLEVEL} kernels no longer build '; \
 			echo 'correctly with gcc-3.4 and your version of binutils.'; \
@@ -322,13 +342,15 @@ checkbin:
 			false; \
 		fi ; \
 	fi
-	@if test "$(cc-version)" -lt "0400" \
+	@if test "${COMPILER}" != "clang" \
+	    && test "$(cc-version)" -lt "0400" \
 	    && test "x${CONFIG_PPC64}" = "xy" ; then \
                 echo -n "Sorry, GCC v4.0 or above is required to build " ; \
                 echo "the 64-bit powerpc kernel." ; \
                 false ; \
         fi
-	@if test "$(cc-fullversion)" = "040200" \
+	@if test "${COMPILER}" != "clang" \
+	    && test "$(cc-fullversion)" = "040200" \
 	    && test "x${CONFIG_MODULES}${CONFIG_PPC64}" = "xyy" ; then \
 		echo -n '*** GCC-4.2.0 cannot compile the 64-bit powerpc ' ; \
 		echo 'kernel with modules enabled.' ; \
@@ -336,6 +358,14 @@ checkbin:
 		echo 'disable kernel modules' ; \
 		false ; \
 	fi
+	@if test "x${CONFIG_CPU_LITTLE_ENDIAN}" = "xy" \
+	    && $(LD) --version | head -1 | grep ' 2\.24$$' >/dev/null ; then \
+		echo -n '*** binutils 2.24 miscompiles weak symbols ' ; \
+		echo 'in some circumstances.' ; \
+		echo -n '*** Please use a different binutils version.' ; \
+		false ; \
+	fi
+
 
 CLEAN_FILES += $(TOUT)
 
diff --git a/arch/powerpc/boot/dts/b4qds.dtsi b/arch/powerpc/boot/dts/b4qds.dtsi
index 24ed80dc2120..559d00657fb5 100644
--- a/arch/powerpc/boot/dts/b4qds.dtsi
+++ b/arch/powerpc/boot/dts/b4qds.dtsi
@@ -106,6 +106,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -116,6 +124,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x2000000>;
 	};
 
+	qportals: qman-portals@ff6000000 {
+		ranges = <0x0 0xf 0xf6000000 0x2000000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/fsl/b4420si-post.dtsi b/arch/powerpc/boot/dts/fsl/b4420si-post.dtsi
index 86161ae6c966..1ea8602e4345 100644
--- a/arch/powerpc/boot/dts/fsl/b4420si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/b4420si-post.dtsi
@@ -80,20 +80,9 @@
 		compatible = "fsl,b4420-device-config", "fsl,qoriq-device-config-2.0";
 	};
 
-/include/ "qoriq-clockgen2.dtsi"
 	global-utilities@e1000 {
-		compatible = "fsl,b4420-clockgen", "fsl,qoriq-clockgen-2.0";
-
-		mux0: mux0@0 {
-			#clock-cells = <0>;
-			reg = <0x0 0x4>;
-			compatible = "fsl,qoriq-core-mux-2.0";
-			clocks = <&pll0 0>, <&pll0 1>, <&pll0 2>,
-				<&pll1 0>, <&pll1 1>, <&pll1 2>;
-			clock-names = "pll0", "pll0-div2", "pll0-div4",
-				"pll1", "pll1-div2", "pll1-div4";
-			clock-output-names = "cmux0";
-		};
+		compatible = "fsl,b4420-clockgen", "fsl,b4-clockgen",
+			      "fsl,qoriq-clockgen-2.0";
 	};
 
 	rcpm: global-utilities@e2000 {
diff --git a/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi b/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi
index f35e9e0a5445..9ba904be39ee 100644
--- a/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/b4860si-post.dtsi
@@ -167,6 +167,75 @@
 	};
 };
 
+&qportals {
+	qportal14: qman-portal@38000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x38000 0x4000>, <0x100e000 0x1000>;
+		interrupts = <132 0x2 0 0>;
+		cell-index = <0xe>;
+	};
+	qportal15: qman-portal@3c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x3c000 0x4000>, <0x100f000 0x1000>;
+		interrupts = <134 0x2 0 0>;
+		cell-index = <0xf>;
+	};
+	qportal16: qman-portal@40000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x40000 0x4000>, <0x1010000 0x1000>;
+		interrupts = <136 0x2 0 0>;
+		cell-index = <0x10>;
+	};
+	qportal17: qman-portal@44000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x44000 0x4000>, <0x1011000 0x1000>;
+		interrupts = <138 0x2 0 0>;
+		cell-index = <0x11>;
+	};
+	qportal18: qman-portal@48000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x48000 0x4000>, <0x1012000 0x1000>;
+		interrupts = <140 0x2 0 0>;
+		cell-index = <0x12>;
+	};
+	qportal19: qman-portal@4c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x4c000 0x4000>, <0x1013000 0x1000>;
+		interrupts = <142 0x2 0 0>;
+		cell-index = <0x13>;
+	};
+	qportal20: qman-portal@50000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x50000 0x4000>, <0x1014000 0x1000>;
+		interrupts = <144 0x2 0 0>;
+		cell-index = <0x14>;
+	};
+	qportal21: qman-portal@54000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x54000 0x4000>, <0x1015000 0x1000>;
+		interrupts = <146 0x2 0 0>;
+		cell-index = <0x15>;
+	};
+	qportal22: qman-portal@58000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x58000 0x4000>, <0x1016000 0x1000>;
+		interrupts = <148 0x2 0 0>;
+		cell-index = <0x16>;
+	};
+	qportal23: qman-portal@5c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x5c000 0x4000>, <0x1017000 0x1000>;
+		interrupts = <150 0x2 0 0>;
+		cell-index = <0x17>;
+	};
+	qportal24: qman-portal@60000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x60000 0x4000>, <0x1018000 0x1000>;
+		interrupts = <152 0x2 0 0>;
+		cell-index = <0x18>;
+	};
+};
+
 &soc {
 	ddr2: memory-controller@9000 {
 		compatible = "fsl,qoriq-memory-controller-v4.5", "fsl,qoriq-memory-controller";
@@ -182,20 +251,9 @@
 		compatible = "fsl,b4860-device-config", "fsl,qoriq-device-config-2.0";
 	};
 
-/include/ "qoriq-clockgen2.dtsi"
 	global-utilities@e1000 {
-		compatible = "fsl,b4860-clockgen", "fsl,qoriq-clockgen-2.0";
-
-		mux0: mux0@0 {
-			#clock-cells = <0>;
-			reg = <0x0 0x4>;
-			compatible = "fsl,qoriq-core-mux-2.0";
-			clocks = <&pll0 0>, <&pll0 1>, <&pll0 2>,
-				<&pll1 0>, <&pll1 1>, <&pll1 2>;
-			clock-names = "pll0", "pll0-div2", "pll0-div4",
-				"pll1", "pll1-div2", "pll1-div4";
-			clock-output-names = "cmux0";
-		};
+		compatible = "fsl,b4860-clockgen", "fsl,b4-clockgen",
+			      "fsl,qoriq-clockgen-2.0";
 	};
 
 	rcpm: global-utilities@e2000 {
diff --git a/arch/powerpc/boot/dts/fsl/b4si-post.dtsi b/arch/powerpc/boot/dts/fsl/b4si-post.dtsi
index 73136c0029d2..603910ac1db0 100644
--- a/arch/powerpc/boot/dts/fsl/b4si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/b4si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10000 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
 &ifc {
 	#address-cells = <2>;
 	#size-cells = <1>;
@@ -210,6 +220,97 @@
 	};
 };
 
+&qportals {
+	#address-cells = <0x1>;
+	#size-cells = <0x1>;
+	compatible = "simple-bus";
+
+	qportal0: qman-portal@0 {
+		compatible = "fsl,qman-portal";
+		reg = <0x0 0x4000>, <0x1000000 0x1000>;
+		interrupts = <104 0x2 0 0>;
+		cell-index = <0x0>;
+	};
+	qportal1: qman-portal@4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x4000 0x4000>, <0x1001000 0x1000>;
+		interrupts = <106 0x2 0 0>;
+		cell-index = <0x1>;
+	};
+	qportal2: qman-portal@8000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x8000 0x4000>, <0x1002000 0x1000>;
+		interrupts = <108 0x2 0 0>;
+		cell-index = <0x2>;
+	};
+	qportal3: qman-portal@c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xc000 0x4000>, <0x1003000 0x1000>;
+		interrupts = <110 0x2 0 0>;
+		cell-index = <0x3>;
+	};
+	qportal4: qman-portal@10000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x10000 0x4000>, <0x1004000 0x1000>;
+		interrupts = <112 0x2 0 0>;
+		cell-index = <0x4>;
+	};
+	qportal5: qman-portal@14000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x14000 0x4000>, <0x1005000 0x1000>;
+		interrupts = <114 0x2 0 0>;
+		cell-index = <0x5>;
+	};
+	qportal6: qman-portal@18000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x18000 0x4000>, <0x1006000 0x1000>;
+		interrupts = <116 0x2 0 0>;
+		cell-index = <0x6>;
+	};
+	qportal7: qman-portal@1c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x1c000 0x4000>, <0x1007000 0x1000>;
+		interrupts = <118 0x2 0 0>;
+		cell-index = <0x7>;
+	};
+	qportal8: qman-portal@20000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x20000 0x4000>, <0x1008000 0x1000>;
+		interrupts = <120 0x2 0 0>;
+		cell-index = <0x8>;
+	};
+	qportal9: qman-portal@24000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x24000 0x4000>, <0x1009000 0x1000>;
+		interrupts = <122 0x2 0 0>;
+		cell-index = <0x9>;
+	};
+	qportal10: qman-portal@28000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x28000 0x4000>, <0x100a000 0x1000>;
+		interrupts = <124 0x2 0 0>;
+		cell-index = <0xa>;
+	};
+	qportal11: qman-portal@2c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x2c000 0x4000>, <0x100b000 0x1000>;
+		interrupts = <126 0x2 0 0>;
+		cell-index = <0xb>;
+	};
+	qportal12: qman-portal@30000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x30000 0x4000>, <0x100c000 0x1000>;
+		interrupts = <128 0x2 0 0>;
+		cell-index = <0xc>;
+	};
+	qportal13: qman-portal@34000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x34000 0x4000>, <0x100d000 0x1000>;
+		interrupts = <130 0x2 0 0>;
+		cell-index = <0xd>;
+	};
+};
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -296,9 +397,21 @@
 		fsl,liodn-bits = <12>;
 	};
 
+/include/ "qoriq-clockgen2.dtsi"
 	clockgen: global-utilities@e1000 {
 		compatible = "fsl,b4-clockgen", "fsl,qoriq-clockgen-2.0";
 		reg = <0xe1000 0x1000>;
+
+		mux0: mux0@0 {
+			#clock-cells = <0>;
+			reg = <0x0 0x4>;
+			compatible = "fsl,qoriq-core-mux-2.0";
+			clocks = <&pll0 0>, <&pll0 1>, <&pll0 2>,
+				<&pll1 0>, <&pll1 1>, <&pll1 2>;
+			clock-names = "pll0", "pll0-div2", "pll0-div4",
+				"pll1", "pll1-div2", "pll1-div4";
+			clock-output-names = "cmux0";
+		};
 	};
 
 	rcpm: global-utilities@e2000 {
@@ -343,6 +456,11 @@
 /include/ "qoriq-duart-1.dtsi"
 /include/ "qoriq-sec5.3-0.dtsi"
 
+/include/ "qoriq-qman3.dtsi"
+	qman: qman@318000 {
+		interrupts = <16 2 1 28>;
+	};
+
 /include/ "qoriq-bman1.dtsi"
 	bman: bman@31a000 {
 		interrupts = <16 2 1 29>;
diff --git a/arch/powerpc/boot/dts/fsl/p1023si-post.dtsi b/arch/powerpc/boot/dts/fsl/p1023si-post.dtsi
index 7780f21430cb..da6d3fc6ba41 100644
--- a/arch/powerpc/boot/dts/fsl/p1023si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p1023si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
 &lbc {
 	#address-cells = <2>;
 	#size-cells = <1>;
@@ -102,6 +112,31 @@
 	};
 };
 
+&qportals {
+	#address-cells = <1>;
+	#size-cells = <1>;
+	compatible = "simple-bus";
+
+	qportal0: qman-portal@0 {
+		compatible = "fsl,qman-portal";
+		reg = <0x0 0x4000>, <0x100000 0x1000>;
+		interrupts = <29 2 0 0>;
+		cell-index = <0>;
+	};
+	qportal1: qman-portal@4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x4000 0x4000>, <0x101000 0x1000>;
+		interrupts = <31 2 0 0>;
+		cell-index = <1>;
+	};
+	qportal2: qman-portal@8000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x8000 0x4000>, <0x102000 0x1000>;
+		interrupts = <33 2 0 0>;
+		cell-index = <2>;
+	};
+};
+
 &bportals {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -248,6 +283,14 @@
 /include/ "pq3-mpic.dtsi"
 /include/ "pq3-mpic-timer-B.dtsi"
 
+	qman: qman@88000 {
+		compatible = "fsl,qman";
+		reg = <0x88000 0x1000>;
+		interrupts = <16 2 0 0>;
+		fsl,qman-portals = <&qportals>;
+		memory-region = <&qman_fqd &qman_pfdr>;
+	};
+
 	bman: bman@8a000 {
 		compatible = "fsl,bman";
 		reg = <0x8a000 0x1000>;
diff --git a/arch/powerpc/boot/dts/fsl/p2041si-post.dtsi b/arch/powerpc/boot/dts/fsl/p2041si-post.dtsi
index f2feacfd9a25..04ad177b6a12 100644
--- a/arch/powerpc/boot/dts/fsl/p2041si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p2041si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
 &lbc {
 	compatible = "fsl,p2041-elbc", "fsl,elbc", "simple-bus";
 	interrupts = <25 2 0 0>;
@@ -223,6 +233,8 @@
 
 /include/ "qoriq-bman1-portals.dtsi"
 
+/include/ "qoriq-qman1-portals.dtsi"
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -370,6 +382,7 @@
 
 /include/ "qoriq-esdhc-0.dtsi"
 	sdhc@114000 {
+		compatible = "fsl,p2041-esdhc", "fsl,esdhc";
 		fsl,iommu-parent = <&pamu1>;
 		fsl,liodn-reg = <&guts 0x530>; /* eSDHCLIODNR */
 		sdhci,auto-cmd12;
@@ -415,5 +428,6 @@ crypto: crypto@300000 {
 		fsl,iommu-parent = <&pamu1>;
 	};
 
+/include/ "qoriq-qman1.dtsi"
 /include/ "qoriq-bman1.dtsi"
 };
diff --git a/arch/powerpc/boot/dts/fsl/p3041si-post.dtsi b/arch/powerpc/boot/dts/fsl/p3041si-post.dtsi
index d6fea37395ad..2cab18af6df2 100644
--- a/arch/powerpc/boot/dts/fsl/p3041si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p3041si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
 &lbc {
 	compatible = "fsl,p3041-elbc", "fsl,elbc", "simple-bus";
 	interrupts = <25 2 0 0>;
@@ -250,6 +260,8 @@
 
 /include/ "qoriq-bman1-portals.dtsi"
 
+/include/ "qoriq-qman1-portals.dtsi"
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -397,6 +409,7 @@
 
 /include/ "qoriq-esdhc-0.dtsi"
 	sdhc@114000 {
+		compatible = "fsl,p3041-esdhc", "fsl,esdhc";
 		fsl,iommu-parent = <&pamu1>;
 		fsl,liodn-reg = <&guts 0x530>; /* eSDHCLIODNR */
 		sdhci,auto-cmd12;
@@ -442,5 +455,6 @@ crypto: crypto@300000 {
 		fsl,iommu-parent = <&pamu1>;
 	};
 
+/include/ "qoriq-qman1.dtsi"
 /include/ "qoriq-bman1.dtsi"
 };
diff --git a/arch/powerpc/boot/dts/fsl/p4080si-post.dtsi b/arch/powerpc/boot/dts/fsl/p4080si-post.dtsi
index 89482c9b2301..dfc76bc41cb2 100644
--- a/arch/powerpc/boot/dts/fsl/p4080si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p4080si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10 0>;
+};
+
 &lbc {
 	compatible = "fsl,p4080-elbc", "fsl,elbc", "simple-bus";
 	interrupts = <25 2 0 0>;
@@ -250,6 +260,8 @@
 
 /include/ "qoriq-bman1-portals.dtsi"
 
+/include/ "qoriq-qman1-portals.dtsi"
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -469,6 +481,7 @@
 
 /include/ "qoriq-esdhc-0.dtsi"
 	sdhc@114000 {
+		compatible = "fsl,p4080-esdhc", "fsl,esdhc";
 		fsl,iommu-parent = <&pamu1>;
 		fsl,liodn-reg = <&guts 0x530>; /* eSDHCLIODNR */
 		voltage-ranges = <3300 3300>;
@@ -498,5 +511,6 @@ crypto: crypto@300000 {
 		fsl,iommu-parent = <&pamu1>;
 	};
 
+/include/ "qoriq-qman1.dtsi"
 /include/ "qoriq-bman1.dtsi"
 };
diff --git a/arch/powerpc/boot/dts/fsl/p5020si-post.dtsi b/arch/powerpc/boot/dts/fsl/p5020si-post.dtsi
index 6e04851e2fc9..b77923ad72cf 100644
--- a/arch/powerpc/boot/dts/fsl/p5020si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p5020si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10000 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
 &lbc {
 	compatible = "fsl,p5020-elbc", "fsl,elbc", "simple-bus";
 	interrupts = <25 2 0 0>;
@@ -247,6 +257,8 @@
 
 /include/ "qoriq-bman1-portals.dtsi"
 
+/include/ "qoriq-qman1-portals.dtsi"
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -384,6 +396,7 @@
 
 /include/ "qoriq-esdhc-0.dtsi"
 	sdhc@114000 {
+		compatible = "fsl,p5020-esdhc", "fsl,esdhc";
 		fsl,iommu-parent = <&pamu1>;
 		fsl,liodn-reg = <&guts 0x530>; /* eSDHCLIODNR */
 		sdhci,auto-cmd12;
@@ -428,6 +441,7 @@
 		fsl,iommu-parent = <&pamu1>;
 	};
 
+/include/ "qoriq-qman1.dtsi"
 /include/ "qoriq-bman1.dtsi"
 
 /include/ "qoriq-raid1.0-0.dtsi"
diff --git a/arch/powerpc/boot/dts/fsl/p5040si-post.dtsi b/arch/powerpc/boot/dts/fsl/p5040si-post.dtsi
index 5e44dfa1e1a5..6d214526b81b 100644
--- a/arch/powerpc/boot/dts/fsl/p5040si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/p5040si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10000 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
 &lbc {
 	compatible = "fsl,p5040-elbc", "fsl,elbc", "simple-bus";
 	interrupts = <25 2 0 0>;
@@ -202,6 +212,8 @@
 
 /include/ "qoriq-bman1-portals.dtsi"
 
+/include/ "qoriq-qman1-portals.dtsi"
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -362,6 +374,7 @@
 
 /include/ "qoriq-esdhc-0.dtsi"
 	sdhc@114000 {
+		compatible = "fsl,p5040-esdhc", "fsl,esdhc";
 		fsl,iommu-parent = <&pamu2>;
 		fsl,liodn-reg = <&guts 0x530>; /* eSDHCLIODNR */
 		sdhci,auto-cmd12;
@@ -407,5 +420,6 @@
 		fsl,iommu-parent = <&pamu4>;
 	};
 
+/include/ "qoriq-qman1.dtsi"
 /include/ "qoriq-bman1.dtsi"
 };
diff --git a/arch/powerpc/boot/dts/fsl/qoriq-qman1-portals.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-qman1-portals.dtsi
index 05d51acafa67..e77e4b4ed53b 100644
--- a/arch/powerpc/boot/dts/fsl/qoriq-qman1-portals.dtsi
+++ b/arch/powerpc/boot/dts/fsl/qoriq-qman1-portals.dtsi
@@ -41,61 +41,61 @@
 		compatible = "fsl,qman-portal";
 		reg = <0x0 0x4000>, <0x100000 0x1000>;
 		interrupts = <104 2 0 0>;
-		fsl,qman-channel-id = <0x0>;
+		cell-index = <0x0>;
 	};
 	qportal1: qman-portal@4000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x4000 0x4000>, <0x101000 0x1000>;
 		interrupts = <106 2 0 0>;
-		fsl,qman-channel-id = <1>;
+		cell-index = <1>;
 	};
 	qportal2: qman-portal@8000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x8000 0x4000>, <0x102000 0x1000>;
 		interrupts = <108 2 0 0>;
-		fsl,qman-channel-id = <2>;
+		cell-index = <2>;
 	};
 	qportal3: qman-portal@c000 {
 		compatible = "fsl,qman-portal";
 		reg = <0xc000 0x4000>, <0x103000 0x1000>;
 		interrupts = <110 2 0 0>;
-		fsl,qman-channel-id = <3>;
+		cell-index = <3>;
 	};
 	qportal4: qman-portal@10000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x10000 0x4000>, <0x104000 0x1000>;
 		interrupts = <112 2 0 0>;
-		fsl,qman-channel-id = <4>;
+		cell-index = <4>;
 	};
 	qportal5: qman-portal@14000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x14000 0x4000>, <0x105000 0x1000>;
 		interrupts = <114 2 0 0>;
-		fsl,qman-channel-id = <5>;
+		cell-index = <5>;
 	};
 	qportal6: qman-portal@18000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x18000 0x4000>, <0x106000 0x1000>;
 		interrupts = <116 2 0 0>;
-		fsl,qman-channel-id = <6>;
+		cell-index = <6>;
 	};
 
 	qportal7: qman-portal@1c000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x1c000 0x4000>, <0x107000 0x1000>;
 		interrupts = <118 2 0 0>;
-		fsl,qman-channel-id = <7>;
+		cell-index = <7>;
 	};
 	qportal8: qman-portal@20000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x20000 0x4000>, <0x108000 0x1000>;
 		interrupts = <120 2 0 0>;
-		fsl,qman-channel-id = <8>;
+		cell-index = <8>;
 	};
 	qportal9: qman-portal@24000 {
 		compatible = "fsl,qman-portal";
 		reg = <0x24000 0x4000>, <0x109000 0x1000>;
 		interrupts = <122 2 0 0>;
-		fsl,qman-channel-id = <9>;
+		cell-index = <9>;
 	};
 };
diff --git a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
new file mode 100644
index 000000000000..df1f068a5376
--- /dev/null
+++ b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
@@ -0,0 +1,330 @@
+/*
+ * T1023 Silicon/SoC Device Tree Source (post include)
+ *
+ * Copyright 2014 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *	 notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *	 notice, this list of conditions and the following disclaimer in the
+ *	 documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *	 names of its contributors may be used to endorse or promote products
+ *	 derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+&ifc {
+	#address-cells = <2>;
+	#size-cells = <1>;
+	compatible = "fsl,ifc", "simple-bus";
+	interrupts = <25 2 0 0>;
+};
+
+&pci0 {
+	compatible = "fsl,t1023-pcie", "fsl,qoriq-pcie-v2.4", "fsl,qoriq-pcie";
+	device_type = "pci";
+	#size-cells = <2>;
+	#address-cells = <3>;
+	bus-range = <0x0 0xff>;
+	interrupts = <20 2 0 0>;
+	fsl,iommu-parent = <&pamu0>;
+	pcie@0 {
+		reg = <0 0 0 0 0>;
+		#interrupt-cells = <1>;
+		#size-cells = <2>;
+		#address-cells = <3>;
+		device_type = "pci";
+		interrupts = <20 2 0 0>;
+		interrupt-map-mask = <0xf800 0 0 7>;
+		interrupt-map = <
+			/* IDSEL 0x0 */
+			0000 0 0 1 &mpic 40 1 0 0
+			0000 0 0 2 &mpic 1 1 0 0
+			0000 0 0 3 &mpic 2 1 0 0
+			0000 0 0 4 &mpic 3 1 0 0
+			>;
+	};
+};
+
+&pci1 {
+	compatible = "fsl,t1023-pcie", "fsl,qoriq-pcie-v2.4", "fsl,qoriq-pcie";
+	device_type = "pci";
+	#size-cells = <2>;
+	#address-cells = <3>;
+	bus-range = <0 0xff>;
+	interrupts = <21 2 0 0>;
+	fsl,iommu-parent = <&pamu0>;
+	pcie@0 {
+		reg = <0 0 0 0 0>;
+		#interrupt-cells = <1>;
+		#size-cells = <2>;
+		#address-cells = <3>;
+		device_type = "pci";
+		interrupts = <21 2 0 0>;
+		interrupt-map-mask = <0xf800 0 0 7>;
+		interrupt-map = <
+			/* IDSEL 0x0 */
+			0000 0 0 1 &mpic 41 1 0 0
+			0000 0 0 2 &mpic 5 1 0 0
+			0000 0 0 3 &mpic 6 1 0 0
+			0000 0 0 4 &mpic 7 1 0 0
+			>;
+	};
+};
+
+&pci2 {
+	compatible = "fsl,t1023-pcie", "fsl,qoriq-pcie-v2.4", "fsl,qoriq-pcie";
+	device_type = "pci";
+	#size-cells = <2>;
+	#address-cells = <3>;
+	bus-range = <0x0 0xff>;
+	interrupts = <22 2 0 0>;
+	fsl,iommu-parent = <&pamu0>;
+	pcie@0 {
+		reg = <0 0 0 0 0>;
+		#interrupt-cells = <1>;
+		#size-cells = <2>;
+		#address-cells = <3>;
+		device_type = "pci";
+		interrupts = <22 2 0 0>;
+		interrupt-map-mask = <0xf800 0 0 7>;
+		interrupt-map = <
+			/* IDSEL 0x0 */
+			0000 0 0 1 &mpic 42 1 0 0
+			0000 0 0 2 &mpic 9 1 0 0
+			0000 0 0 3 &mpic 10 1 0 0
+			0000 0 0 4 &mpic 11 1 0 0
+			>;
+	};
+};
+
+&dcsr {
+	#address-cells = <1>;
+	#size-cells = <1>;
+	compatible = "fsl,dcsr", "simple-bus";
+
+	dcsr-epu@0 {
+		compatible = "fsl,t1023-dcsr-epu", "fsl,dcsr-epu";
+		interrupts = <52 2 0 0
+			      84 2 0 0
+			      85 2 0 0>;
+		reg = <0x0 0x1000>;
+	};
+	dcsr-npc {
+		compatible = "fsl,t1023-dcsr-cnpc", "fsl,dcsr-cnpc";
+		reg = <0x1000 0x1000 0x1002000 0x10000>;
+	};
+	dcsr-nxc@2000 {
+		compatible = "fsl,dcsr-nxc";
+		reg = <0x2000 0x1000>;
+	};
+	dcsr-corenet {
+		compatible = "fsl,dcsr-corenet";
+		reg = <0x8000 0x1000 0x1A000 0x1000>;
+	};
+	dcsr-ocn@11000 {
+		compatible = "fsl,t1023-dcsr-ocn", "fsl,dcsr-ocn";
+		reg = <0x11000 0x1000>;
+	};
+	dcsr-ddr@12000 {
+		compatible = "fsl,dcsr-ddr";
+		dev-handle = <&ddr1>;
+		reg = <0x12000 0x1000>;
+	};
+	dcsr-nal@18000 {
+		compatible = "fsl,t1023-dcsr-nal", "fsl,dcsr-nal";
+		reg = <0x18000 0x1000>;
+	};
+	dcsr-rcpm@22000 {
+		compatible = "fsl,t1023-dcsr-rcpm", "fsl,dcsr-rcpm";
+		reg = <0x22000 0x1000>;
+	};
+	dcsr-snpc@30000 {
+		compatible = "fsl,t1023-dcsr-snpc", "fsl,dcsr-snpc";
+		reg = <0x30000 0x1000 0x1022000 0x10000>;
+	};
+	dcsr-snpc@31000 {
+		compatible = "fsl,t1023-dcsr-snpc", "fsl,dcsr-snpc";
+		reg = <0x31000 0x1000 0x1042000 0x10000>;
+	};
+	dcsr-cpu-sb-proxy@100000 {
+		compatible = "fsl,dcsr-e5500-sb-proxy", "fsl,dcsr-cpu-sb-proxy";
+		cpu-handle = <&cpu0>;
+		reg = <0x100000 0x1000 0x101000 0x1000>;
+	};
+	dcsr-cpu-sb-proxy@108000 {
+		compatible = "fsl,dcsr-e5500-sb-proxy", "fsl,dcsr-cpu-sb-proxy";
+		cpu-handle = <&cpu1>;
+		reg = <0x108000 0x1000 0x109000 0x1000>;
+	};
+};
+
+&soc {
+	#address-cells = <1>;
+	#size-cells = <1>;
+	device_type = "soc";
+	compatible = "simple-bus";
+
+	soc-sram-error {
+		compatible = "fsl,soc-sram-error";
+		interrupts = <16 2 1 29>;
+	};
+
+	corenet-law@0 {
+		compatible = "fsl,corenet-law";
+		reg = <0x0 0x1000>;
+		fsl,num-laws = <16>;
+	};
+
+	ddr1: memory-controller@8000 {
+		compatible = "fsl,qoriq-memory-controller-v5.0",
+				"fsl,qoriq-memory-controller";
+		reg = <0x8000 0x1000>;
+		interrupts = <16 2 1 23>;
+	};
+
+	cpc: l3-cache-controller@10000 {
+		compatible = "fsl,t1023-l3-cache-controller", "cache";
+		reg = <0x10000 0x1000>;
+		interrupts = <16 2 1 27>;
+	};
+
+	corenet-cf@18000 {
+		compatible = "fsl,corenet2-cf";
+		reg = <0x18000 0x1000>;
+		interrupts = <16 2 1 31>;
+	};
+
+	iommu@20000 {
+		compatible = "fsl,pamu-v1.0", "fsl,pamu";
+		reg = <0x20000 0x1000>;
+		ranges = <0 0x20000 0x1000>;
+		#address-cells = <1>;
+		#size-cells = <1>;
+		interrupts = <
+			24 2 0 0
+			16 2 1 30>;
+		pamu0: pamu@0 {
+			reg = <0 0x1000>;
+			fsl,primary-cache-geometry = <128 1>;
+			fsl,secondary-cache-geometry = <32 2>;
+		};
+	};
+
+/include/ "qoriq-mpic.dtsi"
+
+	guts: global-utilities@e0000 {
+		compatible = "fsl,t1023-device-config", "fsl,qoriq-device-config-2.0";
+		reg = <0xe0000 0xe00>;
+		fsl,has-rstcr;
+		fsl,liodn-bits = <12>;
+	};
+
+/include/ "qoriq-clockgen2.dtsi"
+	global-utilities@e1000 {
+		compatible = "fsl,t1023-clockgen", "fsl,qoriq-clockgen-2.0";
+		mux0: mux0@0 {
+			#clock-cells = <0>;
+			reg = <0x0 4>;
+			compatible = "fsl,core-mux-clock";
+			clocks = <&pll0 0>, <&pll0 1>;
+			clock-names = "pll0_0", "pll0_1";
+			clock-output-names = "cmux0";
+		};
+		mux1: mux1@20 {
+			#clock-cells = <0>;
+			reg = <0x20 4>;
+			compatible = "fsl,core-mux-clock";
+			clocks = <&pll0 0>, <&pll0 1>;
+			clock-names = "pll0_0", "pll0_1";
+			clock-output-names = "cmux1";
+		};
+	};
+
+	rcpm: global-utilities@e2000 {
+		compatible = "fsl,t1023-rcpm", "fsl,qoriq-rcpm-2.0";
+		reg = <0xe2000 0x1000>;
+	};
+
+	sfp: sfp@e8000 {
+		compatible = "fsl,t1023-sfp";
+		reg = <0xe8000 0x1000>;
+	};
+
+	serdes: serdes@ea000 {
+		compatible = "fsl,t1023-serdes";
+		reg = <0xea000 0x4000>;
+	};
+
+	scfg: global-utilities@fc000 {
+		compatible = "fsl,t1023-scfg";
+		reg = <0xfc000 0x1000>;
+	};
+
+/include/ "elo3-dma-0.dtsi"
+/include/ "elo3-dma-1.dtsi"
+
+/include/ "qoriq-espi-0.dtsi"
+	spi@110000 {
+		fsl,espi-num-chipselects = <4>;
+	};
+
+/include/ "qoriq-esdhc-0.dtsi"
+	sdhc@114000 {
+		compatible = "fsl,t1023-esdhc", "fsl,esdhc";
+		fsl,iommu-parent = <&pamu0>;
+		fsl,liodn-reg = <&guts 0x530>; /* eSDHCLIODNR */
+		sdhci,auto-cmd12;
+		no-1-8-v;
+	};
+/include/ "qoriq-i2c-0.dtsi"
+/include/ "qoriq-i2c-1.dtsi"
+/include/ "qoriq-duart-0.dtsi"
+/include/ "qoriq-duart-1.dtsi"
+/include/ "qoriq-gpio-0.dtsi"
+/include/ "qoriq-gpio-1.dtsi"
+/include/ "qoriq-gpio-2.dtsi"
+/include/ "qoriq-gpio-3.dtsi"
+/include/ "qoriq-usb2-mph-0.dtsi"
+	usb0: usb@210000 {
+		compatible = "fsl-usb2-mph-v2.5", "fsl-usb2-mph";
+		fsl,iommu-parent = <&pamu0>;
+		fsl,liodn-reg = <&guts 0x520>; /* USB1LIODNR */
+		phy_type = "utmi";
+		port0;
+	};
+/include/ "qoriq-usb2-dr-0.dtsi"
+	usb1: usb@211000 {
+		compatible = "fsl-usb2-dr-v2.5", "fsl-usb2-dr";
+		fsl,iommu-parent = <&pamu0>;
+		fsl,liodn-reg = <&guts 0x524>; /* USB2LIODNR */
+		dr_mode = "host";
+		phy_type = "utmi";
+	};
+/include/ "qoriq-sata2-0.dtsi"
+	sata@220000 {
+		fsl,iommu-parent = <&pamu0>;
+		fsl,liodn-reg = <&guts 0x550>; /* SATA1LIODNR */
+	};
+
+/include/ "qoriq-sec5.0-0.dtsi"
+};
diff --git a/arch/powerpc/boot/dts/fsl/t1024si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1024si-post.dtsi
new file mode 100644
index 000000000000..95e3af8d768e
--- /dev/null
+++ b/arch/powerpc/boot/dts/fsl/t1024si-post.dtsi
@@ -0,0 +1,100 @@
+/*
+ * T1024 Silicon/SoC Device Tree Source (post include)
+ *
+ * Copyright 2014 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/include/ "t1023si-post.dtsi"
+
+/ {
+	aliases {
+		vga = &display;
+		display = &display;
+	};
+
+	qe:qe@ffe140000 {
+		#address-cells = <1>;
+		#size-cells = <1>;
+		device_type = "qe";
+		compatible = "fsl,qe";
+		ranges = <0x0 0xf 0xfe140000 0x40000>;
+		reg = <0xf 0xfe140000 0 0x480>;
+		fsl,qe-num-riscs = <1>;
+		fsl,qe-num-snums = <28>;
+		brg-frequency = <0>;
+		bus-frequency = <0>;
+	};
+};
+
+&soc {
+	display:display@180000 {
+		compatible = "fsl,t1024-diu", "fsl,diu";
+		reg = <0x180000 1000>;
+		interrupts = <74 2 0 0>;
+	};
+};
+
+&qe {
+	qeic: interrupt-controller@80 {
+		interrupt-controller;
+		compatible = "fsl,qe-ic";
+		#address-cells = <0>;
+		#interrupt-cells = <1>;
+		reg = <0x80 0x80>;
+		interrupts = <95 2 0 0  94 2 0 0>; //high:79 low:78
+	};
+
+	ucc@2000 {
+		cell-index = <1>;
+		reg = <0x2000 0x200>;
+		interrupts = <32>;
+		interrupt-parent = <&qeic>;
+	};
+
+	ucc@2200 {
+		cell-index = <3>;
+		reg = <0x2200 0x200>;
+		interrupts = <34>;
+		interrupt-parent = <&qeic>;
+	};
+
+	muram@10000 {
+		#address-cells = <1>;
+		#size-cells = <1>;
+		compatible = "fsl,qe-muram", "fsl,cpm-muram";
+		ranges = <0x0 0x10000 0x6000>;
+
+		data-only@0 {
+			compatible = "fsl,qe-muram-data", "fsl,cpm-muram-data";
+			reg = <0x0 0x6000>;
+		};
+	};
+};
diff --git a/arch/powerpc/boot/dts/fsl/t102xsi-pre.dtsi b/arch/powerpc/boot/dts/fsl/t102xsi-pre.dtsi
new file mode 100644
index 000000000000..1f1a9f8474d5
--- /dev/null
+++ b/arch/powerpc/boot/dts/fsl/t102xsi-pre.dtsi
@@ -0,0 +1,87 @@
+/*
+ * T1024/T1023 Silicon/SoC Device Tree Source (pre include)
+ *
+ * Copyright 2014 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *	 notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *	 notice, this list of conditions and the following disclaimer in the
+ *	 documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *	 names of its contributors may be used to endorse or promote products
+ *	 derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/dts-v1/;
+
+/include/ "e5500_power_isa.dtsi"
+
+/ {
+	#address-cells = <2>;
+	#size-cells = <2>;
+	interrupt-parent = <&mpic>;
+
+	aliases {
+		ccsr = &soc;
+		dcsr = &dcsr;
+
+		dma0 = &dma0;
+		dma1 = &dma1;
+		serial0 = &serial0;
+		serial1 = &serial1;
+		serial2 = &serial2;
+		serial3 = &serial3;
+		pci0 = &pci0;
+		pci1 = &pci1;
+		pci2 = &pci2;
+		usb0 = &usb0;
+		usb1 = &usb1;
+		sdhc = &sdhc;
+
+		crypto = &crypto;
+	};
+
+	cpus {
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		cpu0: PowerPC,e5500@0 {
+			device_type = "cpu";
+			reg = <0>;
+			clocks = <&mux0>;
+			next-level-cache = <&L2_1>;
+			L2_1: l2-cache {
+				next-level-cache = <&cpc>;
+			};
+		};
+		cpu1: PowerPC,e5500@1 {
+			device_type = "cpu";
+			reg = <1>;
+			clocks = <&mux1>;
+			next-level-cache = <&L2_2>;
+			L2_2: l2-cache {
+				next-level-cache = <&cpc>;
+			};
+		};
+	};
+};
diff --git a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi
index 5cc01be5b152..9e9f7e201d43 100644
--- a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10000 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
 &ifc {
 	#address-cells = <2>;
 	#size-cells = <1>;
@@ -280,6 +290,73 @@
 	};
 };
 
+&qportals {
+	#address-cells = <0x1>;
+	#size-cells = <0x1>;
+	compatible = "simple-bus";
+
+	qportal0: qman-portal@0 {
+		compatible = "fsl,qman-portal";
+		reg = <0x0 0x4000>, <0x1000000 0x1000>;
+		interrupts = <104 0x2 0 0>;
+		cell-index = <0x0>;
+	};
+	qportal1: qman-portal@4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x4000 0x4000>, <0x1001000 0x1000>;
+		interrupts = <106 0x2 0 0>;
+		cell-index = <0x1>;
+	};
+	qportal2: qman-portal@8000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x8000 0x4000>, <0x1002000 0x1000>;
+		interrupts = <108 0x2 0 0>;
+		cell-index = <0x2>;
+	};
+	qportal3: qman-portal@c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xc000 0x4000>, <0x1003000 0x1000>;
+		interrupts = <110 0x2 0 0>;
+		cell-index = <0x3>;
+	};
+	qportal4: qman-portal@10000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x10000 0x4000>, <0x1004000 0x1000>;
+		interrupts = <112 0x2 0 0>;
+		cell-index = <0x4>;
+	};
+	qportal5: qman-portal@14000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x14000 0x4000>, <0x1005000 0x1000>;
+		interrupts = <114 0x2 0 0>;
+		cell-index = <0x5>;
+	};
+	qportal6: qman-portal@18000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x18000 0x4000>, <0x1006000 0x1000>;
+		interrupts = <116 0x2 0 0>;
+		cell-index = <0x6>;
+	};
+	qportal7: qman-portal@1c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x1c000 0x4000>, <0x1007000 0x1000>;
+		interrupts = <118 0x2 0 0>;
+		cell-index = <0x7>;
+	};
+	qportal8: qman-portal@20000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x20000 0x4000>, <0x1008000 0x1000>;
+		interrupts = <120 0x2 0 0>;
+		cell-index = <0x8>;
+	};
+	qportal9: qman-portal@24000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x24000 0x4000>, <0x1009000 0x1000>;
+		interrupts = <122 0x2 0 0>;
+		cell-index = <0x9>;
+	};
+};
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -463,5 +540,6 @@
 		fsl,liodn-reg = <&guts 0x554>; /* SATA2LIODNR */
 	};
 /include/ "qoriq-sec5.0-0.dtsi"
+/include/ "qoriq-qman3.dtsi"
 /include/ "qoriq-bman1.dtsi"
 };
diff --git a/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi b/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi
index 86bdaf6cbd14..32c790ae7fde 100644
--- a/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10000 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
 &ifc {
 	#address-cells = <2>;
 	#size-cells = <1>;
@@ -326,6 +336,121 @@
 	};
 };
 
+&qportals {
+	#address-cells = <0x1>;
+	#size-cells = <0x1>;
+	compatible = "simple-bus";
+
+	qportal0: qman-portal@0 {
+		compatible = "fsl,qman-portal";
+		reg = <0x0 0x4000>, <0x1000000 0x1000>;
+		interrupts = <104 0x2 0 0>;
+		cell-index = <0x0>;
+	};
+	qportal1: qman-portal@4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x4000 0x4000>, <0x1001000 0x1000>;
+		interrupts = <106 0x2 0 0>;
+		cell-index = <0x1>;
+	};
+	qportal2: qman-portal@8000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x8000 0x4000>, <0x1002000 0x1000>;
+		interrupts = <108 0x2 0 0>;
+		cell-index = <0x2>;
+	};
+	qportal3: qman-portal@c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xc000 0x4000>, <0x1003000 0x1000>;
+		interrupts = <110 0x2 0 0>;
+		cell-index = <0x3>;
+	};
+	qportal4: qman-portal@10000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x10000 0x4000>, <0x1004000 0x1000>;
+		interrupts = <112 0x2 0 0>;
+		cell-index = <0x4>;
+	};
+	qportal5: qman-portal@14000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x14000 0x4000>, <0x1005000 0x1000>;
+		interrupts = <114 0x2 0 0>;
+		cell-index = <0x5>;
+	};
+	qportal6: qman-portal@18000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x18000 0x4000>, <0x1006000 0x1000>;
+		interrupts = <116 0x2 0 0>;
+		cell-index = <0x6>;
+	};
+	qportal7: qman-portal@1c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x1c000 0x4000>, <0x1007000 0x1000>;
+		interrupts = <118 0x2 0 0>;
+		cell-index = <0x7>;
+	};
+	qportal8: qman-portal@20000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x20000 0x4000>, <0x1008000 0x1000>;
+		interrupts = <120 0x2 0 0>;
+		cell-index = <0x8>;
+	};
+	qportal9: qman-portal@24000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x24000 0x4000>, <0x1009000 0x1000>;
+		interrupts = <122 0x2 0 0>;
+		cell-index = <0x9>;
+	};
+	qportal10: qman-portal@28000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x28000 0x4000>, <0x100a000 0x1000>;
+		interrupts = <124 0x2 0 0>;
+		cell-index = <0xa>;
+	};
+	qportal11: qman-portal@2c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x2c000 0x4000>, <0x100b000 0x1000>;
+		interrupts = <126 0x2 0 0>;
+		cell-index = <0xb>;
+	};
+	qportal12: qman-portal@30000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x30000 0x4000>, <0x100c000 0x1000>;
+		interrupts = <128 0x2 0 0>;
+		cell-index = <0xc>;
+	};
+	qportal13: qman-portal@34000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x34000 0x4000>, <0x100d000 0x1000>;
+		interrupts = <130 0x2 0 0>;
+		cell-index = <0xd>;
+	};
+	qportal14: qman-portal@38000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x38000 0x4000>, <0x100e000 0x1000>;
+		interrupts = <132 0x2 0 0>;
+		cell-index = <0xe>;
+	};
+	qportal15: qman-portal@3c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x3c000 0x4000>, <0x100f000 0x1000>;
+		interrupts = <134 0x2 0 0>;
+		cell-index = <0xf>;
+	};
+	qportal16: qman-portal@40000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x40000 0x4000>, <0x1010000 0x1000>;
+		interrupts = <136 0x2 0 0>;
+		cell-index = <0x10>;
+	};
+	qportal17: qman-portal@44000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x44000 0x4000>, <0x1011000 0x1000>;
+		interrupts = <138 0x2 0 0>;
+		cell-index = <0x11>;
+	};
+};
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -417,7 +542,7 @@
 			compatible = "fsl,qoriq-core-mux-2.0";
 			clocks = <&pll0 0>, <&pll0 1>, <&pll0 2>,
 				 <&pll1 0>, <&pll1 1>, <&pll1 2>;
-			clock-names = "pll0", "pll0-div2", "pll1-div4",
+			clock-names = "pll0", "pll0-div2", "pll0-div4",
 				"pll1", "pll1-div2", "pll1-div4";
 			clock-output-names = "cmux0";
 		};
@@ -428,7 +553,7 @@
 			compatible = "fsl,qoriq-core-mux-2.0";
 			clocks = <&pll0 0>, <&pll0 1>, <&pll0 2>,
 				 <&pll1 0>, <&pll1 1>, <&pll1 2>;
-			clock-names = "pll0", "pll0-div2", "pll1-div4",
+			clock-names = "pll0", "pll0-div2", "pll0-div4",
 				"pll1", "pll1-div2", "pll1-div4";
 			clock-output-names = "cmux1";
 		};
@@ -502,6 +627,7 @@
 		phy_type = "utmi";
 	};
 /include/ "qoriq-sec5.2-0.dtsi"
+/include/ "qoriq-qman3.dtsi"
 /include/ "qoriq-bman1.dtsi"
 
 	L2_1: l2-cache-controller@c20000 {
diff --git a/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi b/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi
index 4d4f25895d8c..d806360d0f64 100644
--- a/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi
+++ b/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi
@@ -37,6 +37,16 @@
 	alloc-ranges = <0 0 0x10000 0>;
 };
 
+&qman_fqd {
+	compatible = "fsl,qman-fqd";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
+&qman_pfdr {
+	compatible = "fsl,qman-pfdr";
+	alloc-ranges = <0 0 0x10000 0>;
+};
+
 &ifc {
 	#address-cells = <2>;
 	#size-cells = <1>;
@@ -556,6 +566,313 @@
 	};
 };
 
+&qportals {
+	#address-cells = <0x1>;
+	#size-cells = <0x1>;
+	compatible = "simple-bus";
+
+	qportal0: qman-portal@0 {
+		compatible = "fsl,qman-portal";
+		reg = <0x0 0x4000>, <0x1000000 0x1000>;
+		interrupts = <104 0x2 0 0>;
+		cell-index = <0x0>;
+	};
+	qportal1: qman-portal@4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x4000 0x4000>, <0x1001000 0x1000>;
+		interrupts = <106 0x2 0 0>;
+		cell-index = <0x1>;
+	};
+	qportal2: qman-portal@8000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x8000 0x4000>, <0x1002000 0x1000>;
+		interrupts = <108 0x2 0 0>;
+		cell-index = <0x2>;
+	};
+	qportal3: qman-portal@c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xc000 0x4000>, <0x1003000 0x1000>;
+		interrupts = <110 0x2 0 0>;
+		cell-index = <0x3>;
+	};
+	qportal4: qman-portal@10000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x10000 0x4000>, <0x1004000 0x1000>;
+		interrupts = <112 0x2 0 0>;
+		cell-index = <0x4>;
+	};
+	qportal5: qman-portal@14000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x14000 0x4000>, <0x1005000 0x1000>;
+		interrupts = <114 0x2 0 0>;
+		cell-index = <0x5>;
+	};
+	qportal6: qman-portal@18000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x18000 0x4000>, <0x1006000 0x1000>;
+		interrupts = <116 0x2 0 0>;
+		cell-index = <0x6>;
+	};
+	qportal7: qman-portal@1c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x1c000 0x4000>, <0x1007000 0x1000>;
+		interrupts = <118 0x2 0 0>;
+		cell-index = <0x7>;
+	};
+	qportal8: qman-portal@20000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x20000 0x4000>, <0x1008000 0x1000>;
+		interrupts = <120 0x2 0 0>;
+		cell-index = <0x8>;
+	};
+	qportal9: qman-portal@24000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x24000 0x4000>, <0x1009000 0x1000>;
+		interrupts = <122 0x2 0 0>;
+		cell-index = <0x9>;
+	};
+	qportal10: qman-portal@28000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x28000 0x4000>, <0x100a000 0x1000>;
+		interrupts = <124 0x2 0 0>;
+		cell-index = <0xa>;
+	};
+	qportal11: qman-portal@2c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x2c000 0x4000>, <0x100b000 0x1000>;
+		interrupts = <126 0x2 0 0>;
+		cell-index = <0xb>;
+	};
+	qportal12: qman-portal@30000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x30000 0x4000>, <0x100c000 0x1000>;
+		interrupts = <128 0x2 0 0>;
+		cell-index = <0xc>;
+	};
+	qportal13: qman-portal@34000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x34000 0x4000>, <0x100d000 0x1000>;
+		interrupts = <130 0x2 0 0>;
+		cell-index = <0xd>;
+	};
+	qportal14: qman-portal@38000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x38000 0x4000>, <0x100e000 0x1000>;
+		interrupts = <132 0x2 0 0>;
+		cell-index = <0xe>;
+	};
+	qportal15: qman-portal@3c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x3c000 0x4000>, <0x100f000 0x1000>;
+		interrupts = <134 0x2 0 0>;
+		cell-index = <0xf>;
+	};
+	qportal16: qman-portal@40000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x40000 0x4000>, <0x1010000 0x1000>;
+		interrupts = <136 0x2 0 0>;
+		cell-index = <0x10>;
+	};
+	qportal17: qman-portal@44000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x44000 0x4000>, <0x1011000 0x1000>;
+		interrupts = <138 0x2 0 0>;
+		cell-index = <0x11>;
+	};
+	qportal18: qman-portal@48000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x48000 0x4000>, <0x1012000 0x1000>;
+		interrupts = <140 0x2 0 0>;
+		cell-index = <0x12>;
+	};
+	qportal19: qman-portal@4c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x4c000 0x4000>, <0x1013000 0x1000>;
+		interrupts = <142 0x2 0 0>;
+		cell-index = <0x13>;
+	};
+	qportal20: qman-portal@50000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x50000 0x4000>, <0x1014000 0x1000>;
+		interrupts = <144 0x2 0 0>;
+		cell-index = <0x14>;
+	};
+	qportal21: qman-portal@54000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x54000 0x4000>, <0x1015000 0x1000>;
+		interrupts = <146 0x2 0 0>;
+		cell-index = <0x15>;
+	};
+	qportal22: qman-portal@58000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x58000 0x4000>, <0x1016000 0x1000>;
+		interrupts = <148 0x2 0 0>;
+		cell-index = <0x16>;
+	};
+	qportal23: qman-portal@5c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x5c000 0x4000>, <0x1017000 0x1000>;
+		interrupts = <150 0x2 0 0>;
+		cell-index = <0x17>;
+	};
+	qportal24: qman-portal@60000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x60000 0x4000>, <0x1018000 0x1000>;
+		interrupts = <152 0x2 0 0>;
+		cell-index = <0x18>;
+	};
+	qportal25: qman-portal@64000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x64000 0x4000>, <0x1019000 0x1000>;
+		interrupts = <154 0x2 0 0>;
+		cell-index = <0x19>;
+	};
+	qportal26: qman-portal@68000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x68000 0x4000>, <0x101a000 0x1000>;
+		interrupts = <156 0x2 0 0>;
+		cell-index = <0x1a>;
+	};
+	qportal27: qman-portal@6c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x6c000 0x4000>, <0x101b000 0x1000>;
+		interrupts = <158 0x2 0 0>;
+		cell-index = <0x1b>;
+	};
+	qportal28: qman-portal@70000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x70000 0x4000>, <0x101c000 0x1000>;
+		interrupts = <160 0x2 0 0>;
+		cell-index = <0x1c>;
+	};
+	qportal29: qman-portal@74000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x74000 0x4000>, <0x101d000 0x1000>;
+		interrupts = <162 0x2 0 0>;
+		cell-index = <0x1d>;
+	};
+	qportal30: qman-portal@78000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x78000 0x4000>, <0x101e000 0x1000>;
+		interrupts = <164 0x2 0 0>;
+		cell-index = <0x1e>;
+	};
+	qportal31: qman-portal@7c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x7c000 0x4000>, <0x101f000 0x1000>;
+		interrupts = <166 0x2 0 0>;
+		cell-index = <0x1f>;
+	};
+	qportal32: qman-portal@80000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x80000 0x4000>, <0x1020000 0x1000>;
+		interrupts = <168 0x2 0 0>;
+		cell-index = <0x20>;
+	};
+	qportal33: qman-portal@84000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x84000 0x4000>, <0x1021000 0x1000>;
+		interrupts = <170 0x2 0 0>;
+		cell-index = <0x21>;
+	};
+	qportal34: qman-portal@88000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x88000 0x4000>, <0x1022000 0x1000>;
+		interrupts = <172 0x2 0 0>;
+		cell-index = <0x22>;
+	};
+	qportal35: qman-portal@8c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x8c000 0x4000>, <0x1023000 0x1000>;
+		interrupts = <174 0x2 0 0>;
+		cell-index = <0x23>;
+	};
+	qportal36: qman-portal@90000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x90000 0x4000>, <0x1024000 0x1000>;
+		interrupts = <384 0x2 0 0>;
+		cell-index = <0x24>;
+	};
+	qportal37: qman-portal@94000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x94000 0x4000>, <0x1025000 0x1000>;
+		interrupts = <386 0x2 0 0>;
+		cell-index = <0x25>;
+	};
+	qportal38: qman-portal@98000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x98000 0x4000>, <0x1026000 0x1000>;
+		interrupts = <388 0x2 0 0>;
+		cell-index = <0x26>;
+	};
+	qportal39: qman-portal@9c000 {
+		compatible = "fsl,qman-portal";
+		reg = <0x9c000 0x4000>, <0x1027000 0x1000>;
+		interrupts = <390 0x2 0 0>;
+		cell-index = <0x27>;
+	};
+	qportal40: qman-portal@a0000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xa0000 0x4000>, <0x1028000 0x1000>;
+		interrupts = <392 0x2 0 0>;
+		cell-index = <0x28>;
+	};
+	qportal41: qman-portal@a4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xa4000 0x4000>, <0x1029000 0x1000>;
+		interrupts = <394 0x2 0 0>;
+		cell-index = <0x29>;
+	};
+	qportal42: qman-portal@a8000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xa8000 0x4000>, <0x102a000 0x1000>;
+		interrupts = <396 0x2 0 0>;
+		cell-index = <0x2a>;
+	};
+	qportal43: qman-portal@ac000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xac000 0x4000>, <0x102b000 0x1000>;
+		interrupts = <398 0x2 0 0>;
+		cell-index = <0x2b>;
+	};
+	qportal44: qman-portal@b0000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xb0000 0x4000>, <0x102c000 0x1000>;
+		interrupts = <400 0x2 0 0>;
+		cell-index = <0x2c>;
+	};
+	qportal45: qman-portal@b4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xb4000 0x4000>, <0x102d000 0x1000>;
+		interrupts = <402 0x2 0 0>;
+		cell-index = <0x2d>;
+	};
+	qportal46: qman-portal@b8000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xb8000 0x4000>, <0x102e000 0x1000>;
+		interrupts = <404 0x2 0 0>;
+		cell-index = <0x2e>;
+	};
+	qportal47: qman-portal@bc000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xbc000 0x4000>, <0x102f000 0x1000>;
+		interrupts = <406 0x2 0 0>;
+		cell-index = <0x2f>;
+	};
+	qportal48: qman-portal@c0000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xc0000 0x4000>, <0x1030000 0x1000>;
+		interrupts = <408 0x2 0 0>;
+		cell-index = <0x30>;
+	};
+	qportal49: qman-portal@c4000 {
+		compatible = "fsl,qman-portal";
+		reg = <0xc4000 0x4000>, <0x1031000 0x1000>;
+		interrupts = <410 0x2 0 0>;
+		cell-index = <0x31>;
+	};
+};
+
 &soc {
 	#address-cells = <1>;
 	#size-cells = <1>;
@@ -748,6 +1065,7 @@
 /include/ "qoriq-sata2-0.dtsi"
 /include/ "qoriq-sata2-1.dtsi"
 /include/ "qoriq-sec5.0-0.dtsi"
+/include/ "qoriq-qman3.dtsi"
 /include/ "qoriq-bman1.dtsi"
 
 	L2_1: l2-cache-controller@c20000 {
diff --git a/arch/powerpc/boot/dts/kmcoge4.dts b/arch/powerpc/boot/dts/kmcoge4.dts
index 97e6d11d1e6d..48dab6a50437 100644
--- a/arch/powerpc/boot/dts/kmcoge4.dts
+++ b/arch/powerpc/boot/dts/kmcoge4.dts
@@ -34,6 +34,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -44,6 +52,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x200000>;
 	};
 
+	qportals: qman-portals@ff4200000 {
+		ranges = <0x0 0xf 0xf4200000 0x200000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/oca4080.dts b/arch/powerpc/boot/dts/oca4080.dts
index eb76caae11d9..42796c5b0561 100644
--- a/arch/powerpc/boot/dts/oca4080.dts
+++ b/arch/powerpc/boot/dts/oca4080.dts
@@ -58,6 +58,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -68,6 +76,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x200000>;
 	};
 
+	qportals: qman-portals@ff4200000 {
+		ranges = <0x0 0xf 0xf4200000 0x200000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/p1023rdb.dts b/arch/powerpc/boot/dts/p1023rdb.dts
index 9236e3742a23..05a00a4d2861 100644
--- a/arch/powerpc/boot/dts/p1023rdb.dts
+++ b/arch/powerpc/boot/dts/p1023rdb.dts
@@ -56,6 +56,18 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
+	};
+
+	qportals: qman-portals@ff000000 {
+		ranges = <0x0 0xf 0xff000000 0x200000>;
 	};
 
 	bportals: bman-portals@ff200000 {
diff --git a/arch/powerpc/boot/dts/p2041rdb.dts b/arch/powerpc/boot/dts/p2041rdb.dts
index c1e69dc7188e..d2bb0765bd5a 100644
--- a/arch/powerpc/boot/dts/p2041rdb.dts
+++ b/arch/powerpc/boot/dts/p2041rdb.dts
@@ -54,6 +54,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -64,6 +72,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x200000>;
 	};
 
+	qportals: qman-portals@ff4200000 {
+		ranges = <0x0 0xf 0xf4200000 0x200000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/p3041ds.dts b/arch/powerpc/boot/dts/p3041ds.dts
index 2192fe94866d..eca6c697cfd7 100644
--- a/arch/powerpc/boot/dts/p3041ds.dts
+++ b/arch/powerpc/boot/dts/p3041ds.dts
@@ -54,6 +54,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -64,6 +72,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x200000>;
 	};
 
+	qportals: qman-portals@ff4200000 {
+		ranges = <0x0 0xf 0xf4200000 0x200000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/p4080ds.dts b/arch/powerpc/boot/dts/p4080ds.dts
index fad441654642..4f80c9d02c27 100644
--- a/arch/powerpc/boot/dts/p4080ds.dts
+++ b/arch/powerpc/boot/dts/p4080ds.dts
@@ -54,6 +54,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -64,6 +72,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x200000>;
 	};
 
+	qportals: qman-portals@ff4200000 {
+		ranges = <0x0 0xf 0xf4200000 0x200000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/p5020ds.dts b/arch/powerpc/boot/dts/p5020ds.dts
index 7382636dc560..d0309a8b9749 100644
--- a/arch/powerpc/boot/dts/p5020ds.dts
+++ b/arch/powerpc/boot/dts/p5020ds.dts
@@ -54,6 +54,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -64,6 +72,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x200000>;
 	};
 
+	qportals: qman-portals@ff4200000 {
+		ranges = <0x0 0xf 0xf4200000 0x200000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/p5040ds.dts b/arch/powerpc/boot/dts/p5040ds.dts
index 35dabf5b6098..05168236d3ab 100644
--- a/arch/powerpc/boot/dts/p5040ds.dts
+++ b/arch/powerpc/boot/dts/p5040ds.dts
@@ -54,6 +54,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -64,6 +72,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x200000>;
 	};
 
+	qportals: qman-portals@ff4200000 {
+		ranges = <0x0 0xf 0xf4200000 0x200000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/t1023rdb.dts b/arch/powerpc/boot/dts/t1023rdb.dts
new file mode 100644
index 000000000000..06b090aba066
--- /dev/null
+++ b/arch/powerpc/boot/dts/t1023rdb.dts
@@ -0,0 +1,151 @@
+/*
+ * T1023 RDB Device Tree Source
+ *
+ * Copyright 2014 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *	 notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *	 notice, this list of conditions and the following disclaimer in the
+ *	 documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *	 names of its contributors may be used to endorse or promote products
+ *	 derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/include/ "fsl/t102xsi-pre.dtsi"
+
+/ {
+	model = "fsl,T1023RDB";
+	compatible = "fsl,T1023RDB";
+	#address-cells = <2>;
+	#size-cells = <2>;
+	interrupt-parent = <&mpic>;
+
+	ifc: localbus@ffe124000 {
+		reg = <0xf 0xfe124000 0 0x2000>;
+		ranges = <0 0 0xf 0xe8000000 0x08000000
+			  1 0 0xf 0xff800000 0x00010000>;
+
+		nor@0,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			status = "disabled";
+			compatible = "cfi-flash";
+			reg = <0x0 0x0 0x8000000>;
+			bank-width = <2>;
+			device-width = <1>;
+		};
+
+		nand@1,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "fsl,ifc-nand";
+			reg = <0x2 0x0 0x10000>;
+		};
+	};
+
+	memory {
+		device_type = "memory";
+	};
+
+	dcsr: dcsr@f00000000 {
+		ranges = <0x00000000 0xf 0x00000000 0x01072000>;
+	};
+
+	soc: soc@ffe000000 {
+		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
+		reg = <0xf 0xfe000000 0 0x00001000>;
+		spi@110000 {
+			flash@0 {
+				#address-cells = <1>;
+				#size-cells = <1>;
+				compatible = "spansion,s25fl512s";
+				reg = <0>;
+				spi-max-frequency = <10000000>; /* input clk */
+			};
+		};
+
+		i2c@118000 {
+			eeprom@50 {
+				compatible = "st,m24256";
+				reg = <0x50>;
+			};
+
+			rtc@68 {
+				compatible = "dallas,ds1339";
+				reg = <0x68>;
+				interrupts = <0x5 0x1 0 0>;
+			};
+		};
+
+		i2c@118100 {
+		};
+	};
+
+	pci0: pcie@ffe240000 {
+		reg = <0xf 0xfe240000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x00000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8000000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+
+	pci1: pcie@ffe250000 {
+		reg = <0xf 0xfe250000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x10000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8010000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+
+	pci2: pcie@ffe260000 {
+		reg = <0xf 0xfe260000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x20000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8020000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+};
+
+/include/ "fsl/t1023si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/t1024qds.dts b/arch/powerpc/boot/dts/t1024qds.dts
new file mode 100644
index 000000000000..f31fabb383b9
--- /dev/null
+++ b/arch/powerpc/boot/dts/t1024qds.dts
@@ -0,0 +1,251 @@
+/*
+ * T1024 QDS Device Tree Source
+ *
+ * Copyright 2014 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *	 notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *	 notice, this list of conditions and the following disclaimer in the
+ *	 documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *	 names of its contributors may be used to endorse or promote products
+ *	 derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/include/ "fsl/t102xsi-pre.dtsi"
+
+/ {
+	model = "fsl,T1024QDS";
+	compatible = "fsl,T1024QDS";
+	#address-cells = <2>;
+	#size-cells = <2>;
+	interrupt-parent = <&mpic>;
+
+	ifc: localbus@ffe124000 {
+		reg = <0xf 0xfe124000 0 0x2000>;
+		ranges = <0 0 0xf 0xe8000000 0x08000000
+			  2 0 0xf 0xff800000 0x00010000
+			  3 0 0xf 0xffdf0000 0x00008000>;
+
+		nor@0,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "cfi-flash";
+			reg = <0x0 0x0 0x8000000>;
+			bank-width = <2>;
+			device-width = <1>;
+		};
+
+		nand@2,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "fsl,ifc-nand";
+			reg = <0x2 0x0 0x10000>;
+		};
+
+		board-control@3,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "fsl,tetra-fpga", "fsl,fpga-qixis";
+			reg = <3 0 0x300>;
+			ranges = <0 3 0 0x300>;
+		};
+	};
+
+	memory {
+		device_type = "memory";
+	};
+
+	dcsr: dcsr@f00000000 {
+		ranges = <0x00000000 0xf 0x00000000 0x01072000>;
+	};
+
+	soc: soc@ffe000000 {
+		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
+		reg = <0xf 0xfe000000 0 0x00001000>;
+		spi@110000 {
+			flash@0 {
+				#address-cells = <1>;
+				#size-cells = <1>;
+				compatible = "micron,n25q128a11";  /* 16MB */
+				reg = <0>;
+				spi-max-frequency = <10000000>;
+			};
+
+			flash@1 {
+				#address-cells = <1>;
+				#size-cells = <1>;
+				compatible = "sst,sst25wf040";  /* 512KB */
+				reg = <1>;
+				spi-max-frequency = <10000000>;
+			};
+
+			flash@2 {
+				#address-cells = <1>;
+				#size-cells = <1>;
+				compatible = "eon,en25s64";   /* 8MB */
+				reg = <2>;
+				spi-max-frequency = <10000000>;
+			};
+
+			slic@2 {
+				compatible = "maxim,ds26522";
+				reg = <2>;
+				spi-max-frequency = <2000000>;
+			};
+
+			slic@3 {
+				compatible = "maxim,ds26522";
+				reg = <3>;
+				spi-max-frequency = <2000000>;
+			};
+		};
+
+		i2c@118000 {
+			pca9547@77 {
+				compatible = "nxp,pca9547";
+				reg = <0x77>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+
+				i2c@0 {
+					#address-cells = <1>;
+					#size-cells = <0>;
+					reg = <0x0>;
+
+					eeprom@50 {
+						compatible = "atmel,24c512";
+						reg = <0x50>;
+					};
+
+					eeprom@51 {
+						compatible = "atmel,24c02";
+						reg = <0x51>;
+					};
+
+					eeprom@57 {
+						compatible = "atmel,24c02";
+						reg = <0x57>;
+					};
+				};
+
+				i2c@2 {
+					#address-cells = <1>;
+					#size-cells = <0>;
+					reg = <0x2>;
+
+					ina220@40 {
+						compatible = "ti,ina220";
+						reg = <0x40>;
+						shunt-resistor = <1000>;
+					};
+
+					ina220@41 {
+						compatible = "ti,ina220";
+						reg = <0x41>;
+						shunt-resistor = <1000>;
+					};
+				};
+
+				i2c@3 {
+					#address-cells = <1>;
+					#size-cells = <0>;
+					reg = <0x3>;
+
+					adt7461@4c {
+						/* Thermal Monitor */
+						compatible = "adi,adt7461";
+						reg = <0x4c>;
+					};
+
+					eeprom@55 {
+						compatible = "atmel,24c02";
+						reg = <0x55>;
+					};
+
+					eeprom@56 {
+						compatible = "atmel,24c512";
+						reg = <0x56>;
+					};
+
+					eeprom@57 {
+						compatible = "atmel,24c512";
+						reg = <0x57>;
+					};
+				};
+			};
+			rtc@68 {
+				compatible = "dallas,ds3232";
+				reg = <0x68>;
+				interrupts = <0x5 0x1 0 0>;
+			};
+		};
+	};
+
+	pci0: pcie@ffe240000 {
+		reg = <0xf 0xfe240000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x00000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8000000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+
+	pci1: pcie@ffe250000 {
+		reg = <0xf 0xfe250000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x10000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8010000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+
+	pci2: pcie@ffe260000 {
+		reg = <0xf 0xfe260000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x20000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8020000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+};
+
+/include/ "fsl/t1024si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/t1024rdb.dts b/arch/powerpc/boot/dts/t1024rdb.dts
new file mode 100644
index 000000000000..733e723ffed6
--- /dev/null
+++ b/arch/powerpc/boot/dts/t1024rdb.dts
@@ -0,0 +1,185 @@
+/*
+ * T1024 RDB Device Tree Source
+ *
+ * Copyright 2014 Freescale Semiconductor Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *	 notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *	 notice, this list of conditions and the following disclaimer in the
+ *	 documentation and/or other materials provided with the distribution.
+ *     * Neither the name of Freescale Semiconductor nor the
+ *	 names of its contributors may be used to endorse or promote products
+ *	 derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor "AS IS" AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/include/ "fsl/t102xsi-pre.dtsi"
+
+/ {
+	model = "fsl,T1024RDB";
+	compatible = "fsl,T1024RDB";
+	#address-cells = <2>;
+	#size-cells = <2>;
+	interrupt-parent = <&mpic>;
+
+	ifc: localbus@ffe124000 {
+		reg = <0xf 0xfe124000 0 0x2000>;
+		ranges = <0 0 0xf 0xe8000000 0x08000000
+			  2 0 0xf 0xff800000 0x00010000
+			  3 0 0xf 0xffdf0000 0x00008000>;
+
+		nor@0,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "cfi-flash";
+			reg = <0x0 0x0 0x8000000>;
+			bank-width = <2>;
+			device-width = <1>;
+		};
+
+		nand@1,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "fsl,ifc-nand";
+			reg = <0x2 0x0 0x10000>;
+		};
+
+		board-control@2,0 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			compatible = "fsl,t1024-cpld";
+			reg = <3 0 0x300>;
+			ranges = <0 3 0 0x300>;
+			bank-width = <1>;
+			device-width = <1>;
+		};
+	};
+
+	memory {
+		device_type = "memory";
+	};
+
+	dcsr: dcsr@f00000000 {
+		ranges = <0x00000000 0xf 0x00000000 0x01072000>;
+	};
+
+	soc: soc@ffe000000 {
+		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
+		reg = <0xf 0xfe000000 0 0x00001000>;
+		spi@110000 {
+			flash@0 {
+				#address-cells = <1>;
+				#size-cells = <1>;
+				compatible = "micron,n25q512ax3";
+				reg = <0>;
+				spi-max-frequency = <10000000>; /* input clk */
+			};
+
+			slic@1 {
+				compatible = "maxim,ds26522";
+				reg = <1>;
+				spi-max-frequency = <2000000>;
+			};
+
+			slic@2 {
+				compatible = "maxim,ds26522";
+				reg = <2>;
+				spi-max-frequency = <2000000>;
+			};
+		};
+
+		i2c@118000 {
+			adt7461@4c {
+				/* Thermal Monitor */
+				compatible = "adi,adt7461";
+				reg = <0x4c>;
+			};
+
+			eeprom@50 {
+				compatible = "atmel,24c256";
+				reg = <0x50>;
+			};
+
+			rtc@68 {
+				compatible = "dallas,ds1339";
+				reg = <0x68>;
+				interrupts = <0x1 0x1 0 0>;
+			};
+		};
+
+		i2c@118100 {
+			pca9546@77 {
+				compatible = "nxp,pca9546";
+				reg = <0x77>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+			};
+		};
+	};
+
+	pci0: pcie@ffe240000 {
+		reg = <0xf 0xfe240000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x00000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8000000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+
+	pci1: pcie@ffe250000 {
+		reg = <0xf 0xfe250000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x10000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8010000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+
+	pci2: pcie@ffe260000 {
+		reg = <0xf 0xfe260000 0 0x10000>;
+		ranges = <0x02000000 0 0xe0000000 0xc 0x20000000 0 0x10000000
+			  0x01000000 0 0x00000000 0xf 0xf8020000 0 0x00010000>;
+		pcie@0 {
+			ranges = <0x02000000 0 0xe0000000
+				  0x02000000 0 0xe0000000
+				  0 0x10000000
+
+				  0x01000000 0 0x00000000
+				  0x01000000 0 0x00000000
+				  0 0x00010000>;
+		};
+	};
+};
+
+/include/ "fsl/t1024si-post.dtsi"
diff --git a/arch/powerpc/boot/dts/t104xqds.dtsi b/arch/powerpc/boot/dts/t104xqds.dtsi
index f7e9bfbeefc7..1498d1e4aecf 100644
--- a/arch/powerpc/boot/dts/t104xqds.dtsi
+++ b/arch/powerpc/boot/dts/t104xqds.dtsi
@@ -47,6 +47,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	ifc: localbus@ffe124000 {
@@ -92,6 +100,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x2000000>;
 	};
 
+	qportals: qman-portals@ff6000000 {
+		ranges = <0x0 0xf 0xf6000000 0x2000000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/t104xrdb.dtsi b/arch/powerpc/boot/dts/t104xrdb.dtsi
index 76e07a3f2ca8..830ea484295b 100644
--- a/arch/powerpc/boot/dts/t104xrdb.dtsi
+++ b/arch/powerpc/boot/dts/t104xrdb.dtsi
@@ -42,6 +42,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	ifc: localbus@ffe124000 {
@@ -83,6 +91,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x2000000>;
 	};
 
+	qportals: qman-portals@ff6000000 {
+		ranges = <0x0 0xf 0xf6000000 0x2000000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/t208xqds.dtsi b/arch/powerpc/boot/dts/t208xqds.dtsi
index c42e07f4f648..869f9159b4d1 100644
--- a/arch/powerpc/boot/dts/t208xqds.dtsi
+++ b/arch/powerpc/boot/dts/t208xqds.dtsi
@@ -48,6 +48,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	ifc: localbus@ffe124000 {
@@ -93,6 +101,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x2000000>;
 	};
 
+	qportals: qman-portals@ff6000000 {
+		ranges = <0x0 0xf 0xf6000000 0x2000000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/t208xrdb.dtsi b/arch/powerpc/boot/dts/t208xrdb.dtsi
index e1463b165d0e..693d2a8fa01c 100644
--- a/arch/powerpc/boot/dts/t208xrdb.dtsi
+++ b/arch/powerpc/boot/dts/t208xrdb.dtsi
@@ -48,6 +48,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	ifc: localbus@ffe124000 {
@@ -94,6 +102,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x2000000>;
 	};
 
+	qportals: qman-portals@ff6000000 {
+		ranges = <0x0 0xf 0xf6000000 0x2000000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/t4240qds.dts b/arch/powerpc/boot/dts/t4240qds.dts
index 6df77766410b..93722da10e16 100644
--- a/arch/powerpc/boot/dts/t4240qds.dts
+++ b/arch/powerpc/boot/dts/t4240qds.dts
@@ -109,6 +109,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -119,6 +127,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x2000000>;
 	};
 
+	qportals: qman-portals@ff6000000 {
+		ranges = <0x0 0xf 0xf6000000 0x2000000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/boot/dts/t4240rdb.dts b/arch/powerpc/boot/dts/t4240rdb.dts
index 46049cf37f02..993eb4b8a487 100644
--- a/arch/powerpc/boot/dts/t4240rdb.dts
+++ b/arch/powerpc/boot/dts/t4240rdb.dts
@@ -78,6 +78,14 @@
 			size = <0 0x1000000>;
 			alignment = <0 0x1000000>;
 		};
+		qman_fqd: qman-fqd {
+			size = <0 0x400000>;
+			alignment = <0 0x400000>;
+		};
+		qman_pfdr: qman-pfdr {
+			size = <0 0x2000000>;
+			alignment = <0 0x2000000>;
+		};
 	};
 
 	dcsr: dcsr@f00000000 {
@@ -88,6 +96,10 @@
 		ranges = <0x0 0xf 0xf4000000 0x2000000>;
 	};
 
+	qportals: qman-portals@ff6000000 {
+		ranges = <0x0 0xf 0xf6000000 0x2000000>;
+	};
+
 	soc: soc@ffe000000 {
 		ranges = <0x00000000 0xf 0xfe000000 0x1000000>;
 		reg = <0xf 0xfe000000 0 0x00001000>;
diff --git a/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig b/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig
index 34f3ea1729e0..858b539d004b 100644
--- a/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig
+++ b/arch/powerpc/configs/85xx/xes_mpc85xx_defconfig
@@ -108,7 +108,7 @@ CONFIG_SENSORS_LM90=y
 CONFIG_WATCHDOG=y
 CONFIG_USB=y
 CONFIG_USB_MON=y
-CONFIG_USB_ISP1760_HCD=y
+CONFIG_USB_ISP1760=y
 CONFIG_USB_STORAGE=y
 CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=y
diff --git a/arch/powerpc/configs/le.config b/arch/powerpc/configs/le.config
new file mode 100644
index 000000000000..ee43fdb3b8f4
--- /dev/null
+++ b/arch/powerpc/configs/le.config
@@ -0,0 +1 @@
+CONFIG_CPU_LITTLE_ENDIAN=y
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index aad501ae3834..a97efc2146fd 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -155,6 +155,7 @@ CONFIG_ACENIC=m
 CONFIG_ACENIC_OMIT_TIGON_I=y
 CONFIG_PCNET32=y
 CONFIG_TIGON3=y
+CONFIG_BNX2X=m
 CONFIG_CHELSIO_T1=m
 CONFIG_BE2NET=m
 CONFIG_S2IO=m
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index c2e39f66b182..0d9efcedaf34 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -154,6 +154,7 @@ CONFIG_ACENIC=m
 CONFIG_ACENIC_OMIT_TIGON_I=y
 CONFIG_PCNET32=y
 CONFIG_TIGON3=y
+CONFIG_BNX2X=m
 CONFIG_CHELSIO_T1=m
 CONFIG_BE2NET=m
 CONFIG_S2IO=m
@@ -297,7 +298,6 @@ CONFIG_CODE_PATCHING_SELFTEST=y
 CONFIG_FTR_FIXUP_SELFTEST=y
 CONFIG_MSI_BITMAP_SELFTEST=y
 CONFIG_XMON=y
-CONFIG_XMON_DEFAULT=y
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
diff --git a/arch/powerpc/configs/pseries_le_defconfig b/arch/powerpc/configs/pseries_le_defconfig
deleted file mode 100644
index 09bc96e792cd..000000000000
--- a/arch/powerpc/configs/pseries_le_defconfig
+++ /dev/null
@@ -1,319 +0,0 @@
-CONFIG_PPC64=y
-CONFIG_SMP=y
-CONFIG_NR_CPUS=2048
-CONFIG_CPU_LITTLE_ENDIAN=y
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_FHANDLE=y
-CONFIG_AUDIT=y
-CONFIG_AUDITSYSCALL=y
-CONFIG_IRQ_DOMAIN_DEBUG=y
-CONFIG_NO_HZ=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_TASKSTATS=y
-CONFIG_TASK_DELAY_ACCT=y
-CONFIG_TASK_XACCT=y
-CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_IKCONFIG=y
-CONFIG_IKCONFIG_PROC=y
-CONFIG_NUMA_BALANCING=y
-CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y
-CONFIG_CGROUPS=y
-CONFIG_CGROUP_FREEZER=y
-CONFIG_CGROUP_DEVICE=y
-CONFIG_CPUSETS=y
-CONFIG_CGROUP_CPUACCT=y
-CONFIG_MEMCG=y
-CONFIG_MEMCG_SWAP=y
-CONFIG_CGROUP_PERF=y
-CONFIG_CGROUP_SCHED=y
-CONFIG_USER_NS=y
-CONFIG_BLK_DEV_INITRD=y
-# CONFIG_COMPAT_BRK is not set
-CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
-CONFIG_KPROBES=y
-CONFIG_JUMP_LABEL=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODVERSIONS=y
-CONFIG_MODULE_SRCVERSION_ALL=y
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_PPC_SPLPAR=y
-CONFIG_SCANLOG=m
-CONFIG_PPC_SMLPAR=y
-CONFIG_DTL=y
-# CONFIG_PPC_PMAC is not set
-CONFIG_RTAS_FLASH=m
-CONFIG_IBMEBUS=y
-CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
-CONFIG_HZ_100=y
-CONFIG_BINFMT_MISC=m
-CONFIG_PPC_TRANSACTIONAL_MEM=y
-CONFIG_KEXEC=y
-CONFIG_IRQ_ALL_CPUS=y
-CONFIG_MEMORY_HOTPLUG=y
-CONFIG_MEMORY_HOTREMOVE=y
-CONFIG_KSM=y
-CONFIG_TRANSPARENT_HUGEPAGE=y
-CONFIG_PPC_64K_PAGES=y
-CONFIG_PPC_SUBPAGE_PROT=y
-CONFIG_SCHED_SMT=y
-CONFIG_HOTPLUG_PCI=y
-CONFIG_HOTPLUG_PCI_RPA=m
-CONFIG_HOTPLUG_PCI_RPA_DLPAR=m
-CONFIG_NET=y
-CONFIG_PACKET=y
-CONFIG_UNIX=y
-CONFIG_XFRM_USER=m
-CONFIG_NET_KEY=m
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_NET_IPIP=y
-CONFIG_SYN_COOKIES=y
-CONFIG_INET_AH=m
-CONFIG_INET_ESP=m
-CONFIG_INET_IPCOMP=m
-# CONFIG_IPV6 is not set
-CONFIG_NETFILTER=y
-# CONFIG_NETFILTER_ADVANCED is not set
-CONFIG_BRIDGE=m
-CONFIG_VLAN_8021Q=m
-CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
-CONFIG_DEVTMPFS=y
-CONFIG_DEVTMPFS_MOUNT=y
-CONFIG_PARPORT=m
-CONFIG_PARPORT_PC=m
-CONFIG_BLK_DEV_FD=m
-CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_NBD=m
-CONFIG_BLK_DEV_RAM=y
-CONFIG_BLK_DEV_RAM_SIZE=65536
-CONFIG_VIRTIO_BLK=m
-CONFIG_IDE=y
-CONFIG_BLK_DEV_IDECD=y
-CONFIG_BLK_DEV_GENERIC=y
-CONFIG_BLK_DEV_AMD74XX=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_CHR_DEV_ST=y
-CONFIG_BLK_DEV_SR=y
-CONFIG_BLK_DEV_SR_VENDOR=y
-CONFIG_CHR_DEV_SG=y
-CONFIG_SCSI_CONSTANTS=y
-CONFIG_SCSI_FC_ATTRS=y
-CONFIG_SCSI_CXGB3_ISCSI=m
-CONFIG_SCSI_CXGB4_ISCSI=m
-CONFIG_SCSI_BNX2_ISCSI=m
-CONFIG_BE2ISCSI=m
-CONFIG_SCSI_MPT2SAS=m
-CONFIG_SCSI_IBMVSCSI=y
-CONFIG_SCSI_IBMVFC=m
-CONFIG_SCSI_SYM53C8XX_2=y
-CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0
-CONFIG_SCSI_IPR=y
-CONFIG_SCSI_QLA_FC=m
-CONFIG_SCSI_QLA_ISCSI=m
-CONFIG_SCSI_LPFC=m
-CONFIG_SCSI_VIRTIO=m
-CONFIG_SCSI_DH=m
-CONFIG_SCSI_DH_RDAC=m
-CONFIG_SCSI_DH_ALUA=m
-CONFIG_ATA=y
-CONFIG_SATA_AHCI=y
-# CONFIG_ATA_SFF is not set
-CONFIG_MD=y
-CONFIG_BLK_DEV_MD=y
-CONFIG_MD_LINEAR=y
-CONFIG_MD_RAID0=y
-CONFIG_MD_RAID1=y
-CONFIG_MD_RAID10=m
-CONFIG_MD_RAID456=m
-CONFIG_MD_MULTIPATH=m
-CONFIG_MD_FAULTY=m
-CONFIG_BLK_DEV_DM=y
-CONFIG_DM_CRYPT=m
-CONFIG_DM_SNAPSHOT=m
-CONFIG_DM_THIN_PROVISIONING=m
-CONFIG_DM_MIRROR=m
-CONFIG_DM_ZERO=m
-CONFIG_DM_MULTIPATH=m
-CONFIG_DM_MULTIPATH_QL=m
-CONFIG_DM_MULTIPATH_ST=m
-CONFIG_DM_UEVENT=y
-CONFIG_BONDING=m
-CONFIG_DUMMY=m
-CONFIG_MACVLAN=m
-CONFIG_MACVTAP=m
-CONFIG_VXLAN=m
-CONFIG_NETCONSOLE=y
-CONFIG_TUN=m
-CONFIG_VETH=m
-CONFIG_VIRTIO_NET=m
-CONFIG_VHOST_NET=m
-CONFIG_VORTEX=y
-CONFIG_ACENIC=m
-CONFIG_ACENIC_OMIT_TIGON_I=y
-CONFIG_PCNET32=y
-CONFIG_TIGON3=y
-CONFIG_CHELSIO_T1=m
-CONFIG_BE2NET=m
-CONFIG_S2IO=m
-CONFIG_IBMVETH=y
-CONFIG_EHEA=y
-CONFIG_E100=y
-CONFIG_E1000=y
-CONFIG_E1000E=y
-CONFIG_IXGB=m
-CONFIG_IXGBE=m
-CONFIG_MLX4_EN=m
-CONFIG_MYRI10GE=m
-CONFIG_QLGE=m
-CONFIG_NETXEN_NIC=m
-CONFIG_PPP=m
-CONFIG_PPP_BSDCOMP=m
-CONFIG_PPP_DEFLATE=m
-CONFIG_PPPOE=m
-CONFIG_PPP_ASYNC=m
-CONFIG_PPP_SYNC_TTY=m
-# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
-CONFIG_INPUT_EVDEV=m
-CONFIG_INPUT_MISC=y
-CONFIG_INPUT_PCSPKR=m
-# CONFIG_SERIO_SERPORT is not set
-CONFIG_DEVPTS_MULTIPLE_INSTANCES=y
-CONFIG_SERIAL_8250=y
-CONFIG_SERIAL_8250_CONSOLE=y
-CONFIG_SERIAL_ICOM=m
-CONFIG_SERIAL_JSM=m
-CONFIG_HVC_CONSOLE=y
-CONFIG_HVC_RTAS=y
-CONFIG_HVCS=m
-CONFIG_VIRTIO_CONSOLE=m
-CONFIG_IBM_BSR=m
-CONFIG_GEN_RTC=y
-CONFIG_RAW_DRIVER=y
-CONFIG_MAX_RAW_DEVS=1024
-CONFIG_FB=y
-CONFIG_FIRMWARE_EDID=y
-CONFIG_FB_OF=y
-CONFIG_FB_MATROX=y
-CONFIG_FB_MATROX_MILLENIUM=y
-CONFIG_FB_MATROX_MYSTIQUE=y
-CONFIG_FB_MATROX_G=y
-CONFIG_FB_RADEON=y
-CONFIG_FB_IBM_GXT4500=y
-CONFIG_LCD_PLATFORM=m
-# CONFIG_VGA_CONSOLE is not set
-CONFIG_FRAMEBUFFER_CONSOLE=y
-CONFIG_LOGO=y
-CONFIG_HID_GYRATION=y
-CONFIG_HID_PANTHERLORD=y
-CONFIG_HID_PETALYNX=y
-CONFIG_HID_SAMSUNG=y
-CONFIG_HID_SUNPLUS=y
-CONFIG_USB_HIDDEV=y
-CONFIG_USB=y
-CONFIG_USB_MON=m
-CONFIG_USB_EHCI_HCD=y
-# CONFIG_USB_EHCI_HCD_PPC_OF is not set
-CONFIG_USB_OHCI_HCD=y
-CONFIG_USB_STORAGE=m
-CONFIG_INFINIBAND=m
-CONFIG_INFINIBAND_USER_MAD=m
-CONFIG_INFINIBAND_USER_ACCESS=m
-CONFIG_INFINIBAND_MTHCA=m
-CONFIG_INFINIBAND_EHCA=m
-CONFIG_INFINIBAND_CXGB3=m
-CONFIG_INFINIBAND_CXGB4=m
-CONFIG_MLX4_INFINIBAND=m
-CONFIG_INFINIBAND_IPOIB=m
-CONFIG_INFINIBAND_IPOIB_CM=y
-CONFIG_INFINIBAND_SRP=m
-CONFIG_INFINIBAND_ISER=m
-CONFIG_VIRTIO_PCI=m
-CONFIG_VIRTIO_BALLOON=m
-CONFIG_EXT2_FS=y
-CONFIG_EXT2_FS_XATTR=y
-CONFIG_EXT2_FS_POSIX_ACL=y
-CONFIG_EXT2_FS_SECURITY=y
-CONFIG_EXT2_FS_XIP=y
-CONFIG_EXT3_FS=y
-CONFIG_EXT3_FS_POSIX_ACL=y
-CONFIG_EXT3_FS_SECURITY=y
-CONFIG_EXT4_FS=y
-CONFIG_EXT4_FS_POSIX_ACL=y
-CONFIG_EXT4_FS_SECURITY=y
-CONFIG_REISERFS_FS=y
-CONFIG_REISERFS_FS_XATTR=y
-CONFIG_REISERFS_FS_POSIX_ACL=y
-CONFIG_REISERFS_FS_SECURITY=y
-CONFIG_JFS_FS=m
-CONFIG_JFS_POSIX_ACL=y
-CONFIG_JFS_SECURITY=y
-CONFIG_XFS_FS=m
-CONFIG_XFS_POSIX_ACL=y
-CONFIG_BTRFS_FS=m
-CONFIG_BTRFS_FS_POSIX_ACL=y
-CONFIG_NILFS2_FS=m
-CONFIG_AUTOFS4_FS=m
-CONFIG_FUSE_FS=m
-CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=y
-CONFIG_UDF_FS=m
-CONFIG_MSDOS_FS=y
-CONFIG_VFAT_FS=y
-CONFIG_PROC_KCORE=y
-CONFIG_TMPFS=y
-CONFIG_TMPFS_POSIX_ACL=y
-CONFIG_HUGETLBFS=y
-CONFIG_CRAMFS=m
-CONFIG_SQUASHFS=m
-CONFIG_SQUASHFS_XATTR=y
-CONFIG_SQUASHFS_LZO=y
-CONFIG_SQUASHFS_XZ=y
-CONFIG_PSTORE=y
-CONFIG_NFS_FS=y
-CONFIG_NFS_V3_ACL=y
-CONFIG_NFS_V4=y
-CONFIG_NFSD=m
-CONFIG_NFSD_V3_ACL=y
-CONFIG_NFSD_V4=y
-CONFIG_CIFS=m
-CONFIG_CIFS_XATTR=y
-CONFIG_CIFS_POSIX=y
-CONFIG_NLS_DEFAULT="utf8"
-CONFIG_NLS_CODEPAGE_437=y
-CONFIG_NLS_ASCII=y
-CONFIG_NLS_ISO8859_1=y
-CONFIG_NLS_UTF8=y
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_DEBUG_STACK_USAGE=y
-CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_LOCKUP_DETECTOR=y
-CONFIG_LATENCYTOP=y
-CONFIG_SCHED_TRACER=y
-CONFIG_BLK_DEV_IO_TRACE=y
-CONFIG_CODE_PATCHING_SELFTEST=y
-CONFIG_FTR_FIXUP_SELFTEST=y
-CONFIG_MSI_BITMAP_SELFTEST=y
-CONFIG_XMON=y
-CONFIG_CRYPTO_TEST=m
-CONFIG_CRYPTO_PCBC=m
-CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_TGR192=m
-CONFIG_CRYPTO_WP512=m
-CONFIG_CRYPTO_ANUBIS=m
-CONFIG_CRYPTO_BLOWFISH=m
-CONFIG_CRYPTO_CAST6=m
-CONFIG_CRYPTO_KHAZAD=m
-CONFIG_CRYPTO_SALSA20=m
-CONFIG_CRYPTO_SERPENT=m
-CONFIG_CRYPTO_TEA=m
-CONFIG_CRYPTO_TWOFISH=m
-CONFIG_CRYPTO_LZO=m
-# CONFIG_CRYPTO_ANSI_CPRNG is not set
-CONFIG_VIRTUALIZATION=y
-CONFIG_KVM_BOOK3S_64=m
-CONFIG_KVM_BOOK3S_64_HV=m
diff --git a/arch/powerpc/crypto/md5-glue.c b/arch/powerpc/crypto/md5-glue.c
index 452fb4dc575f..92289679b4c4 100644
--- a/arch/powerpc/crypto/md5-glue.c
+++ b/arch/powerpc/crypto/md5-glue.c
@@ -37,10 +37,10 @@ static int ppc_md5_init(struct shash_desc *desc)
 {
 	struct md5_state *sctx = shash_desc_ctx(desc);
 
-	sctx->hash[0] = 0x67452301;
-	sctx->hash[1] = 0xefcdab89;
-	sctx->hash[2] = 0x98badcfe;
-	sctx->hash[3] =	0x10325476;
+	sctx->hash[0] = MD5_H0;
+	sctx->hash[1] = MD5_H1;
+	sctx->hash[2] = MD5_H2;
+	sctx->hash[3] =	MD5_H3;
 	sctx->byte_count = 0;
 
 	return 0;
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index a3bf5be111ff..51ccc7232042 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -34,7 +34,7 @@
 #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
 #define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
 
-#define set_mb(var, value)	do { var = value; mb(); } while (0)
+#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); mb(); } while (0)
 
 #ifdef __SUBARCH_HAS_LWSYNC
 #    define SMPWMB      LWSYNC
@@ -89,5 +89,6 @@ do {									\
 
 #define smp_mb__before_atomic()     smp_mb()
 #define smp_mb__after_atomic()      smp_mb()
+#define smp_mb__before_spinlock()   smp_mb()
 
 #endif /* _ASM_POWERPC_BARRIER_H */
diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index d463c68fe7f0..ad6263cffb0f 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -144,7 +144,6 @@ __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
  * Compare and exchange - if *p == old, set it to new,
  * and return the old value of *p.
  */
-#define __HAVE_ARCH_CMPXCHG	1
 
 static __always_inline unsigned long
 __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 6367b8347dad..b118072670fb 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -242,11 +242,13 @@ enum {
 
 /* We only set the TM feature if the kernel was compiled with TM supprt */
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-#define CPU_FTR_TM_COMP		CPU_FTR_TM
-#define PPC_FEATURE2_HTM_COMP	PPC_FEATURE2_HTM
+#define CPU_FTR_TM_COMP			CPU_FTR_TM
+#define PPC_FEATURE2_HTM_COMP		PPC_FEATURE2_HTM
+#define PPC_FEATURE2_HTM_NOSC_COMP	PPC_FEATURE2_HTM_NOSC
 #else
-#define CPU_FTR_TM_COMP		0
-#define PPC_FEATURE2_HTM_COMP	0
+#define CPU_FTR_TM_COMP			0
+#define PPC_FEATURE2_HTM_COMP		0
+#define PPC_FEATURE2_HTM_NOSC_COMP	0
 #endif
 
 /* We need to mark all pages as being coherent if we're SMP or we have a
@@ -366,7 +368,7 @@ enum {
 	    CPU_FTR_USE_TB | CPU_FTR_MAYBE_CAN_NAP | \
 	    CPU_FTR_COMMON | CPU_FTR_FPU_UNAVAILABLE)
 #define CPU_FTRS_CLASSIC32	(CPU_FTR_COMMON | CPU_FTR_USE_TB)
-#define CPU_FTRS_8XX	(CPU_FTR_USE_TB)
+#define CPU_FTRS_8XX	(CPU_FTR_USE_TB | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_40X	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_44X	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_440x6	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE | \
diff --git a/arch/powerpc/include/asm/cputhreads.h b/arch/powerpc/include/asm/cputhreads.h
index 5be6c4753667..ba42e46ea58e 100644
--- a/arch/powerpc/include/asm/cputhreads.h
+++ b/arch/powerpc/include/asm/cputhreads.h
@@ -31,9 +31,9 @@ extern cpumask_t threads_core_mask;
 /* cpu_thread_mask_to_cores - Return a cpumask of one per cores
  *                            hit by the argument
  *
- * @threads:	a cpumask of threads
+ * @threads:	a cpumask of online threads
  *
- * This function returns a cpumask which will have one "cpu" (or thread)
+ * This function returns a cpumask which will have one online cpu's
  * bit set for each core that has at least one thread set in the argument.
  *
  * This can typically be used for things like IPI for tlb invalidations
@@ -42,13 +42,16 @@ extern cpumask_t threads_core_mask;
 static inline cpumask_t cpu_thread_mask_to_cores(const struct cpumask *threads)
 {
 	cpumask_t	tmp, res;
-	int		i;
+	int		i, cpu;
 
 	cpumask_clear(&res);
 	for (i = 0; i < NR_CPUS; i += threads_per_core) {
 		cpumask_shift_left(&tmp, &threads_core_mask, i);
-		if (cpumask_intersects(threads, &tmp))
-			cpumask_set_cpu(i, &res);
+		if (cpumask_intersects(threads, &tmp)) {
+			cpu = cpumask_next_and(-1, &tmp, cpu_online_mask);
+			if (cpu < nr_cpu_ids)
+				cpumask_set_cpu(cpu, &res);
+		}
 	}
 	return res;
 }
diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
index 9f1371bab5fc..e9bdda88f1fb 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -46,6 +46,9 @@ struct dev_archdata {
 #ifdef CONFIG_FAIL_IOMMU
 	int fail_iommu;
 #endif
+#ifdef CONFIG_CXL_BASE
+	struct cxl_context	*cxl_ctx;
+#endif
 };
 
 struct pdev_archdata {
diff --git a/arch/powerpc/include/asm/edac.h b/arch/powerpc/include/asm/edac.h
index 6ead88bbfbb8..5571e23d253e 100644
--- a/arch/powerpc/include/asm/edac.h
+++ b/arch/powerpc/include/asm/edac.h
@@ -12,11 +12,11 @@
 #define ASM_EDAC_H
 /*
  * ECC atomic, DMA, SMP and interrupt safe scrub function.
- * Implements the per arch atomic_scrub() that EDAC use for software
+ * Implements the per arch edac_atomic_scrub() that EDAC use for software
  * ECC scrubbing.  It reads memory and then writes back the original
  * value, allowing the hardware to detect and correct memory errors.
  */
-static __inline__ void atomic_scrub(void *va, u32 size)
+static __inline__ void edac_atomic_scrub(void *va, u32 size)
 {
 	unsigned int *virt_addr = va;
 	unsigned int temp;
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index a52db28ecc1e..c5eb86f3d452 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -27,6 +27,8 @@
 #include <linux/time.h>
 #include <linux/atomic.h>
 
+#include <uapi/asm/eeh.h>
+
 struct pci_dev;
 struct pci_bus;
 struct pci_dn;
@@ -185,11 +187,6 @@ enum {
 #define EEH_STATE_DMA_ACTIVE	(1 << 4)	/* Active DMA		*/
 #define EEH_STATE_MMIO_ENABLED	(1 << 5)	/* MMIO enabled		*/
 #define EEH_STATE_DMA_ENABLED	(1 << 6)	/* DMA enabled		*/
-#define EEH_PE_STATE_NORMAL		0	/* Normal state		*/
-#define EEH_PE_STATE_RESET		1	/* PE reset asserted	*/
-#define EEH_PE_STATE_STOPPED_IO_DMA	2	/* Frozen PE		*/
-#define EEH_PE_STATE_STOPPED_DMA	4	/* Stopped DMA, Enabled IO */
-#define EEH_PE_STATE_UNAVAIL		5	/* Unavailable		*/
 #define EEH_RESET_DEACTIVATE	0	/* Deactivate the PE reset	*/
 #define EEH_RESET_HOT		1	/* Hot reset			*/
 #define EEH_RESET_FUNDAMENTAL	3	/* Fundamental reset		*/
@@ -294,6 +291,8 @@ int eeh_pe_set_option(struct eeh_pe *pe, int option);
 int eeh_pe_get_state(struct eeh_pe *pe);
 int eeh_pe_reset(struct eeh_pe *pe, int option);
 int eeh_pe_configure(struct eeh_pe *pe);
+int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func,
+		      unsigned long addr, unsigned long mask);
 
 /**
  * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
diff --git a/arch/powerpc/include/asm/icswx.h b/arch/powerpc/include/asm/icswx.h
new file mode 100644
index 000000000000..9f8402b35115
--- /dev/null
+++ b/arch/powerpc/include/asm/icswx.h
@@ -0,0 +1,184 @@
+/*
+ * ICSWX api
+ *
+ * Copyright (C) 2015 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This provides the Initiate Coprocessor Store Word Indexed (ICSWX)
+ * instruction.  This instruction is used to communicate with PowerPC
+ * coprocessors.  This also provides definitions of the structures used
+ * to communicate with the coprocessor.
+ *
+ * The RFC02130: Coprocessor Architecture document is the reference for
+ * everything in this file unless otherwise noted.
+ */
+#ifndef _ARCH_POWERPC_INCLUDE_ASM_ICSWX_H_
+#define _ARCH_POWERPC_INCLUDE_ASM_ICSWX_H_
+
+#include <asm/ppc-opcode.h> /* for PPC_ICSWX */
+
+/* Chapter 6.5.8 Coprocessor-Completion Block (CCB) */
+
+#define CCB_VALUE		(0x3fffffffffffffff)
+#define CCB_ADDRESS		(0xfffffffffffffff8)
+#define CCB_CM			(0x0000000000000007)
+#define CCB_CM0			(0x0000000000000004)
+#define CCB_CM12		(0x0000000000000003)
+
+#define CCB_CM0_ALL_COMPLETIONS	(0x0)
+#define CCB_CM0_LAST_IN_CHAIN	(0x4)
+#define CCB_CM12_STORE		(0x0)
+#define CCB_CM12_INTERRUPT	(0x1)
+
+#define CCB_SIZE		(0x10)
+#define CCB_ALIGN		CCB_SIZE
+
+struct coprocessor_completion_block {
+	__be64 value;
+	__be64 address;
+} __packed __aligned(CCB_ALIGN);
+
+
+/* Chapter 6.5.7 Coprocessor-Status Block (CSB) */
+
+#define CSB_V			(0x80)
+#define CSB_F			(0x04)
+#define CSB_CH			(0x03)
+#define CSB_CE_INCOMPLETE	(0x80)
+#define CSB_CE_TERMINATION	(0x40)
+#define CSB_CE_TPBC		(0x20)
+
+#define CSB_CC_SUCCESS		(0)
+#define CSB_CC_INVALID_ALIGN	(1)
+#define CSB_CC_OPERAND_OVERLAP	(2)
+#define CSB_CC_DATA_LENGTH	(3)
+#define CSB_CC_TRANSLATION	(5)
+#define CSB_CC_PROTECTION	(6)
+#define CSB_CC_RD_EXTERNAL	(7)
+#define CSB_CC_INVALID_OPERAND	(8)
+#define CSB_CC_PRIVILEGE	(9)
+#define CSB_CC_INTERNAL		(10)
+#define CSB_CC_WR_EXTERNAL	(12)
+#define CSB_CC_NOSPC		(13)
+#define CSB_CC_EXCESSIVE_DDE	(14)
+#define CSB_CC_WR_TRANSLATION	(15)
+#define CSB_CC_WR_PROTECTION	(16)
+#define CSB_CC_UNKNOWN_CODE	(17)
+#define CSB_CC_ABORT		(18)
+#define CSB_CC_TRANSPORT	(20)
+#define CSB_CC_SEGMENTED_DDL	(31)
+#define CSB_CC_PROGRESS_POINT	(32)
+#define CSB_CC_DDE_OVERFLOW	(33)
+#define CSB_CC_SESSION		(34)
+#define CSB_CC_PROVISION	(36)
+#define CSB_CC_CHAIN		(37)
+#define CSB_CC_SEQUENCE		(38)
+#define CSB_CC_HW		(39)
+
+#define CSB_SIZE		(0x10)
+#define CSB_ALIGN		CSB_SIZE
+
+struct coprocessor_status_block {
+	u8 flags;
+	u8 cs;
+	u8 cc;
+	u8 ce;
+	__be32 count;
+	__be64 address;
+} __packed __aligned(CSB_ALIGN);
+
+
+/* Chapter 6.5.10 Data-Descriptor List (DDL)
+ * each list contains one or more Data-Descriptor Entries (DDE)
+ */
+
+#define DDE_P			(0x8000)
+
+#define DDE_SIZE		(0x10)
+#define DDE_ALIGN		DDE_SIZE
+
+struct data_descriptor_entry {
+	__be16 flags;
+	u8 count;
+	u8 index;
+	__be32 length;
+	__be64 address;
+} __packed __aligned(DDE_ALIGN);
+
+
+/* Chapter 6.5.2 Coprocessor-Request Block (CRB) */
+
+#define CRB_SIZE		(0x80)
+#define CRB_ALIGN		(0x100) /* Errata: requires 256 alignment */
+
+/* Coprocessor Status Block field
+ *   ADDRESS	address of CSB
+ *   C		CCB is valid
+ *   AT		0 = addrs are virtual, 1 = addrs are phys
+ *   M		enable perf monitor
+ */
+#define CRB_CSB_ADDRESS		(0xfffffffffffffff0)
+#define CRB_CSB_C		(0x0000000000000008)
+#define CRB_CSB_AT		(0x0000000000000002)
+#define CRB_CSB_M		(0x0000000000000001)
+
+struct coprocessor_request_block {
+	__be32 ccw;
+	__be32 flags;
+	__be64 csb_addr;
+
+	struct data_descriptor_entry source;
+	struct data_descriptor_entry target;
+
+	struct coprocessor_completion_block ccb;
+
+	u8 reserved[48];
+
+	struct coprocessor_status_block csb;
+} __packed __aligned(CRB_ALIGN);
+
+
+/* RFC02167 Initiate Coprocessor Instructions document
+ * Chapter 8.2.1.1.1 RS
+ * Chapter 8.2.3 Coprocessor Directive
+ * Chapter 8.2.4 Execution
+ *
+ * The CCW must be converted to BE before passing to icswx()
+ */
+
+#define CCW_PS			(0xff000000)
+#define CCW_CT			(0x00ff0000)
+#define CCW_CD			(0x0000ffff)
+#define CCW_CL			(0x0000c000)
+
+
+/* RFC02167 Initiate Coprocessor Instructions document
+ * Chapter 8.2.1 Initiate Coprocessor Store Word Indexed (ICSWX)
+ * Chapter 8.2.4.1 Condition Register 0
+ */
+
+#define ICSWX_INITIATED		(0x8)
+#define ICSWX_BUSY		(0x4)
+#define ICSWX_REJECTED		(0x2)
+
+static inline int icswx(__be32 ccw, struct coprocessor_request_block *crb)
+{
+	__be64 ccw_reg = ccw;
+	u32 cr;
+
+	__asm__ __volatile__(
+	PPC_ICSWX(%1,0,%2) "\n"
+	"mfcr %0\n"
+	: "=r" (cr)
+	: "r" (ccw_reg), "r" (crb)
+	: "cr0", "memory");
+
+	return (int)((cr >> 28) & 0xf);
+}
+
+
+#endif /* _ARCH_POWERPC_INCLUDE_ASM_ICSWX_H_ */
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 1e27d6338565..ca18cff90900 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -44,6 +44,39 @@
 extern int iommu_is_off;
 extern int iommu_force_on;
 
+struct iommu_table_ops {
+	/*
+	 * When called with direction==DMA_NONE, it is equal to clear().
+	 * uaddr is a linear map address.
+	 */
+	int (*set)(struct iommu_table *tbl,
+			long index, long npages,
+			unsigned long uaddr,
+			enum dma_data_direction direction,
+			struct dma_attrs *attrs);
+#ifdef CONFIG_IOMMU_API
+	/*
+	 * Exchanges existing TCE with new TCE plus direction bits;
+	 * returns old TCE and DMA direction mask.
+	 * @tce is a physical address.
+	 */
+	int (*exchange)(struct iommu_table *tbl,
+			long index,
+			unsigned long *hpa,
+			enum dma_data_direction *direction);
+#endif
+	void (*clear)(struct iommu_table *tbl,
+			long index, long npages);
+	/* get() returns a physical address */
+	unsigned long (*get)(struct iommu_table *tbl, long index);
+	void (*flush)(struct iommu_table *tbl);
+	void (*free)(struct iommu_table *tbl);
+};
+
+/* These are used by VIO */
+extern struct iommu_table_ops iommu_table_lpar_multi_ops;
+extern struct iommu_table_ops iommu_table_pseries_ops;
+
 /*
  * IOMAP_MAX_ORDER defines the largest contiguous block
  * of dma space we can get.  IOMAP_MAX_ORDER = 13
@@ -64,6 +97,9 @@ struct iommu_pool {
 struct iommu_table {
 	unsigned long  it_busno;     /* Bus number this table belongs to */
 	unsigned long  it_size;      /* Size of iommu table in entries */
+	unsigned long  it_indirect_levels;
+	unsigned long  it_level_size;
+	unsigned long  it_allocated_size;
 	unsigned long  it_offset;    /* Offset into global table */
 	unsigned long  it_base;      /* mapped address of tce table */
 	unsigned long  it_index;     /* which iommu table this is */
@@ -75,15 +111,16 @@ struct iommu_table {
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
 	unsigned long  it_page_shift;/* table iommu page size */
-#ifdef CONFIG_IOMMU_API
-	struct iommu_group *it_group;
-#endif
-	void (*set_bypass)(struct iommu_table *tbl, bool enable);
-#ifdef CONFIG_PPC_POWERNV
-	void           *data;
-#endif
+	struct list_head it_group_list;/* List of iommu_table_group_link */
+	unsigned long *it_userspace; /* userspace view of the table */
+	struct iommu_table_ops *it_ops;
 };
 
+#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
+		((tbl)->it_userspace ? \
+			&((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
+			NULL)
+
 /* Pure 2^n version of get_order */
 static inline __attribute_const__
 int get_iommu_order(unsigned long size, struct iommu_table *tbl)
@@ -112,14 +149,62 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 					    int nid);
+#define IOMMU_TABLE_GROUP_MAX_TABLES	2
+
+struct iommu_table_group;
+
+struct iommu_table_group_ops {
+	unsigned long (*get_table_size)(
+			__u32 page_shift,
+			__u64 window_size,
+			__u32 levels);
+	long (*create_table)(struct iommu_table_group *table_group,
+			int num,
+			__u32 page_shift,
+			__u64 window_size,
+			__u32 levels,
+			struct iommu_table **ptbl);
+	long (*set_window)(struct iommu_table_group *table_group,
+			int num,
+			struct iommu_table *tblnew);
+	long (*unset_window)(struct iommu_table_group *table_group,
+			int num);
+	/* Switch ownership from platform code to external user (e.g. VFIO) */
+	void (*take_ownership)(struct iommu_table_group *table_group);
+	/* Switch ownership from external user (e.g. VFIO) back to core */
+	void (*release_ownership)(struct iommu_table_group *table_group);
+};
+
+struct iommu_table_group_link {
+	struct list_head next;
+	struct rcu_head rcu;
+	struct iommu_table_group *table_group;
+};
+
+struct iommu_table_group {
+	/* IOMMU properties */
+	__u32 tce32_start;
+	__u32 tce32_size;
+	__u64 pgsizes; /* Bitmap of supported page sizes */
+	__u32 max_dynamic_windows_supported;
+	__u32 max_levels;
+
+	struct iommu_group *group;
+	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+	struct iommu_table_group_ops *ops;
+};
+
 #ifdef CONFIG_IOMMU_API
-extern void iommu_register_group(struct iommu_table *tbl,
+
+extern void iommu_register_group(struct iommu_table_group *table_group,
 				 int pci_domain_number, unsigned long pe_num);
 extern int iommu_add_device(struct device *dev);
 extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
+extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
+		unsigned long *hpa, enum dma_data_direction *direction);
 #else
-static inline void iommu_register_group(struct iommu_table *tbl,
+static inline void iommu_register_group(struct iommu_table_group *table_group,
 					int pci_domain_number,
 					unsigned long pe_num)
 {
@@ -140,13 +225,6 @@ static inline int __init tce_iommu_bus_notifier_init(void)
 }
 #endif /* !CONFIG_IOMMU_API */
 
-static inline void set_iommu_table_base_and_group(struct device *dev,
-						  void *base)
-{
-	set_iommu_table_base(dev, base);
-	iommu_add_device(dev);
-}
-
 extern int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 			    struct scatterlist *sglist, int nelems,
 			    unsigned long mask,
@@ -197,20 +275,13 @@ extern int iommu_tce_clear_param_check(struct iommu_table *tbl,
 		unsigned long npages);
 extern int iommu_tce_put_param_check(struct iommu_table *tbl,
 		unsigned long ioba, unsigned long tce);
-extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
-		unsigned long hwaddr, enum dma_data_direction direction);
-extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
-		unsigned long entry);
-extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-		unsigned long entry, unsigned long pages);
-extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
-		unsigned long entry, unsigned long tce);
 
 extern void iommu_flush_tce(struct iommu_table *tbl);
 extern int iommu_take_ownership(struct iommu_table *tbl);
 extern void iommu_release_ownership(struct iommu_table *tbl);
 
 extern enum dma_data_direction iommu_tce_direction(unsigned long tce);
+extern unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir);
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 3536d12eb798..2aa79c864e91 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -430,7 +430,7 @@ static inline void note_hpte_modification(struct kvm *kvm,
  */
 static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
 {
-	return rcu_dereference_raw_notrace(kvm->memslots);
+	return rcu_dereference_raw_notrace(kvm->memslots[0]);
 }
 
 extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index a193a13cf08b..d91f65b28e32 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -698,7 +698,7 @@ struct kvm_vcpu_arch {
 static inline void kvm_arch_hardware_disable(void) {}
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
-static inline void kvm_arch_memslots_updated(struct kvm *kvm) {}
+static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_exit(void) {}
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index b8475daad884..c6ef05bd0765 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -182,10 +182,11 @@ extern int kvmppc_core_create_memslot(struct kvm *kvm,
 				      unsigned long npages);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
-				struct kvm_userspace_memory_region *mem);
+				const struct kvm_userspace_memory_region *mem);
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem,
-				const struct kvm_memory_slot *old);
+				const struct kvm_userspace_memory_region *mem,
+				const struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new);
 extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
 				      struct kvm_ppc_smmu_info *info);
 extern void kvmppc_core_flush_memslot(struct kvm *kvm,
@@ -243,10 +244,11 @@ struct kvmppc_ops {
 	void (*flush_memslot)(struct kvm *kvm, struct kvm_memory_slot *memslot);
 	int (*prepare_memory_region)(struct kvm *kvm,
 				     struct kvm_memory_slot *memslot,
-				     struct kvm_userspace_memory_region *mem);
+				     const struct kvm_userspace_memory_region *mem);
 	void (*commit_memory_region)(struct kvm *kvm,
-				     struct kvm_userspace_memory_region *mem,
-				     const struct kvm_memory_slot *old);
+				     const struct kvm_userspace_memory_region *mem,
+				     const struct kvm_memory_slot *old,
+				     const struct kvm_memory_slot *new);
 	int (*unmap_hva)(struct kvm *kvm, unsigned long hva);
 	int (*unmap_hva_range)(struct kvm *kvm, unsigned long start,
 			   unsigned long end);
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index ef8899432ae7..952579f5e79a 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -65,31 +65,6 @@ struct machdep_calls {
 	 * destroyed as well */
 	void		(*hpte_clear_all)(void);
 
-	int		(*tce_build)(struct iommu_table *tbl,
-				     long index,
-				     long npages,
-				     unsigned long uaddr,
-				     enum dma_data_direction direction,
-				     struct dma_attrs *attrs);
-	void		(*tce_free)(struct iommu_table *tbl,
-				    long index,
-				    long npages);
-	unsigned long	(*tce_get)(struct iommu_table *tbl,
-				    long index);
-	void		(*tce_flush)(struct iommu_table *tbl);
-
-	/* _rm versions are for real mode use only */
-	int		(*tce_build_rm)(struct iommu_table *tbl,
-				     long index,
-				     long npages,
-				     unsigned long uaddr,
-				     enum dma_data_direction direction,
-				     struct dma_attrs *attrs);
-	void		(*tce_free_rm)(struct iommu_table *tbl,
-				    long index,
-				    long npages);
-	void		(*tce_flush_rm)(struct iommu_table *tbl);
-
 	void __iomem *	(*ioremap)(phys_addr_t addr, unsigned long size,
 				   unsigned long flags, void *caller);
 	void		(*iounmap)(volatile void __iomem *token);
@@ -131,12 +106,6 @@ struct machdep_calls {
 	/* To setup PHBs when using automatic OF platform driver for PCI */
 	int		(*pci_setup_phb)(struct pci_controller *host);
 
-#ifdef CONFIG_PCI_MSI
-	int		(*setup_msi_irqs)(struct pci_dev *dev,
-					  int nvec, int type);
-	void		(*teardown_msi_irqs)(struct pci_dev *dev);
-#endif
-
 	void		(*restart)(char *cmd);
 	void		(*halt)(void);
 	void		(*panic)(char *str);
diff --git a/arch/powerpc/include/asm/mmu-8xx.h b/arch/powerpc/include/asm/mmu-8xx.h
index 986b9e1e1044..f05500a29a60 100644
--- a/arch/powerpc/include/asm/mmu-8xx.h
+++ b/arch/powerpc/include/asm/mmu-8xx.h
@@ -27,6 +27,19 @@
 #define MI_Ks		0x80000000	/* Should not be set */
 #define MI_Kp		0x40000000	/* Should always be set */
 
+/*
+ * All pages' PP exec bits are set to 000, which means Execute for Supervisor
+ * and no Execute for User.
+ * Then we use the APG to say whether accesses are according to Page rules,
+ * "all Supervisor" rules (Exec for all) and "all User" rules (Exec for noone)
+ * Therefore, we define 4 APG groups. msb is _PAGE_EXEC, lsb is _PAGE_USER
+ * 0 (00) => Not User, no exec => 11 (all accesses performed as user)
+ * 1 (01) => User but no exec => 11 (all accesses performed as user)
+ * 2 (10) => Not User, exec => 01 (rights according to page definition)
+ * 3 (11) => User, exec => 00 (all accesses performed as supervisor)
+ */
+#define MI_APG_INIT	0xf4ffffff
+
 /* The effective page number register.  When read, contains the information
  * about the last instruction TLB miss.  When MI_RPN is written, bits in
  * this register are used to create the TLB entry.
@@ -87,6 +100,19 @@
 #define MD_Ks		0x80000000	/* Should not be set */
 #define MD_Kp		0x40000000	/* Should always be set */
 
+/*
+ * All pages' PP data bits are set to either 000 or 011, which means
+ * respectively RW for Supervisor and no access for User, or RO for
+ * Supervisor and no access for user.
+ * Then we use the APG to say whether accesses are according to Page rules or
+ * "all Supervisor" rules (Access to all)
+ * Therefore, we define 2 APG groups. lsb is _PAGE_USER
+ * 0 => No user => 01 (all accesses performed according to page definition)
+ * 1 => User => 00 (all accesses performed as supervisor
+ *                                 according to page definition)
+ */
+#define MD_APG_INIT	0x4fffffff
+
 /* The effective page number register.  When read, contains the information
  * about the last instruction TLB miss.  When MD_RPN is written, bits in
  * this register are used to create the TLB entry.
@@ -145,7 +171,14 @@ typedef struct {
 } mm_context_t;
 #endif /* !__ASSEMBLY__ */
 
+#if (PAGE_SHIFT == 12)
 #define mmu_virtual_psize	MMU_PAGE_4K
+#elif (PAGE_SHIFT == 14)
+#define mmu_virtual_psize	MMU_PAGE_16K
+#else
+#error "Unsupported PAGE_SIZE"
+#endif
+
 #define mmu_linear_psize	MMU_PAGE_8M
 
 #endif /* _ASM_POWERPC_MMU_8XX_H_ */
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 1da6a81ce541..a82f5347540a 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -536,6 +536,9 @@ typedef struct {
 	/* for 4K PTE fragment support */
 	void *pte_frag;
 #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	struct list_head iommu_group_mem_list;
+#endif
 } mm_context_t;
 
 
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 73382eba02dc..3e5184210d9b 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -16,6 +16,24 @@
  */
 extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
 extern void destroy_context(struct mm_struct *mm);
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+struct mm_iommu_table_group_mem_t;
+
+extern bool mm_iommu_preregistered(void);
+extern long mm_iommu_get(unsigned long ua, unsigned long entries,
+		struct mm_iommu_table_group_mem_t **pmem);
+extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
+extern void mm_iommu_init(mm_context_t *ctx);
+extern void mm_iommu_cleanup(mm_context_t *ctx);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
+		unsigned long size);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_find(unsigned long ua,
+		unsigned long entries);
+extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+		unsigned long ua, unsigned long *hpa);
+extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
+extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
+#endif
 
 extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
 extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 0321a909e663..e9e4c52f3685 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -153,7 +153,8 @@
 #define OPAL_FLASH_READ				110
 #define OPAL_FLASH_WRITE			111
 #define OPAL_FLASH_ERASE			112
-#define OPAL_LAST				112
+#define OPAL_PRD_MSG				113
+#define OPAL_LAST				113
 
 /* Device tree flags */
 
@@ -165,6 +166,13 @@
 #define OPAL_PM_WINKLE_ENABLED		0x00040000
 #define OPAL_PM_SLEEP_ENABLED_ER1	0x00080000 /* with workaround */
 
+/*
+ * OPAL_CONFIG_CPU_IDLE_STATE parameters
+ */
+#define OPAL_CONFIG_IDLE_FASTSLEEP	1
+#define OPAL_CONFIG_IDLE_UNDO		0
+#define OPAL_CONFIG_IDLE_APPLY		1
+
 #ifndef __ASSEMBLY__
 
 /* Other enums */
@@ -352,6 +360,7 @@ enum opal_msg_type {
 	OPAL_MSG_SHUTDOWN,		/* params[0] = 1 reboot, 0 shutdown */
 	OPAL_MSG_HMI_EVT,
 	OPAL_MSG_DPO,
+	OPAL_MSG_PRD,
 	OPAL_MSG_TYPE_MAX,
 };
 
@@ -674,6 +683,23 @@ typedef struct oppanel_line {
 	__be64 line_len;
 } oppanel_line_t;
 
+enum opal_prd_msg_type {
+	OPAL_PRD_MSG_TYPE_INIT = 0,	/* HBRT --> OPAL */
+	OPAL_PRD_MSG_TYPE_FINI,		/* HBRT/kernel --> OPAL */
+	OPAL_PRD_MSG_TYPE_ATTN,		/* HBRT <-- OPAL */
+	OPAL_PRD_MSG_TYPE_ATTN_ACK,	/* HBRT --> OPAL */
+	OPAL_PRD_MSG_TYPE_OCC_ERROR,	/* HBRT <-- OPAL */
+	OPAL_PRD_MSG_TYPE_OCC_RESET,	/* HBRT <-- OPAL */
+};
+
+struct opal_prd_msg_header {
+	uint8_t		type;
+	uint8_t		pad[1];
+	__be16		size;
+};
+
+struct opal_prd_msg;
+
 /*
  * SG entries
  *
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 042af1abfc4d..958e941c0cda 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -186,6 +186,7 @@ int64_t opal_handle_hmi(void);
 int64_t opal_register_dump_region(uint32_t id, uint64_t start, uint64_t end);
 int64_t opal_unregister_dump_region(uint32_t id);
 int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val);
+int64_t opal_config_cpu_idle_state(uint64_t state, uint64_t flag);
 int64_t opal_pci_set_phb_cxl_mode(uint64_t phb_id, uint64_t mode, uint64_t pe_number);
 int64_t opal_ipmi_send(uint64_t interface, struct opal_ipmi_msg *msg,
 		uint64_t msg_len);
@@ -193,6 +194,7 @@ int64_t opal_ipmi_recv(uint64_t interface, struct opal_ipmi_msg *msg,
 		uint64_t *msg_len);
 int64_t opal_i2c_request(uint64_t async_token, uint32_t bus_id,
 			 struct opal_i2c_request *oreq);
+int64_t opal_prd_msg(struct opal_prd_msg *msg);
 
 int64_t opal_flash_read(uint64_t id, uint64_t offset, uint64_t buf,
 		uint64_t size, uint64_t token);
@@ -239,6 +241,10 @@ extern int opal_elog_init(void);
 extern void opal_platform_dump_init(void);
 extern void opal_sys_param_init(void);
 extern void opal_msglog_init(void);
+extern int opal_async_comp_init(void);
+extern int opal_sensor_init(void);
+extern int opal_hmi_handler_init(void);
+extern int opal_event_init(void);
 
 extern int opal_machine_check(struct pt_regs *regs);
 extern bool opal_mce_check_early_recovery(struct pt_regs *regs);
@@ -250,6 +256,8 @@ extern int opal_resync_timebase(void);
 
 extern void opal_lpc_init(void);
 
+extern int opal_event_request(unsigned int opal_event_nr);
+
 struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
 					     unsigned long vmalloc_size);
 void opal_free_sg_list(struct opal_sg_list *sg);
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 69c059887a2c..71294a6e976e 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -278,9 +278,7 @@ extern long long virt_phys_offset;
 
 #ifndef __ASSEMBLY__
 
-#undef STRICT_MM_TYPECHECKS
-
-#ifdef STRICT_MM_TYPECHECKS
+#ifdef CONFIG_STRICT_MM_TYPECHECKS
 /* These are used to make use of C type-checking. */
 
 /* PTE level */
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 1811c44bf34b..712add590445 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -27,9 +27,23 @@ struct pci_controller_ops {
 	 * allow assignment/enabling of the device. */
 	bool		(*enable_device_hook)(struct pci_dev *);
 
+	void		(*disable_device)(struct pci_dev *);
+
+	void		(*release_device)(struct pci_dev *);
+
 	/* Called during PCI resource reassignment */
 	resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type);
 	void		(*reset_secondary_bus)(struct pci_dev *dev);
+
+#ifdef CONFIG_PCI_MSI
+	int		(*setup_msi_irqs)(struct pci_dev *dev,
+					  int nvec, int type);
+	void		(*teardown_msi_irqs)(struct pci_dev *dev);
+#endif
+
+	int             (*dma_set_mask)(struct pci_dev *dev, u64 dma_mask);
+
+	void		(*shutdown)(struct pci_controller *);
 };
 
 /*
@@ -185,7 +199,7 @@ struct pci_dn {
 
 	struct  pci_dn *parent;
 	struct  pci_controller *phb;	/* for pci devices */
-	struct	iommu_table *iommu_table;	/* for phb's or bridges */
+	struct	iommu_table_group *table_group;	/* for phb's or bridges */
 	struct	device_node *node;	/* back-pointer to the device_node */
 
 	int	pci_ext_config_space;	/* for pci devices */
diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 4aef8d660999..99dc432b256a 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -71,36 +71,6 @@ extern struct dma_map_ops *get_pci_dma_ops(void);
  */
 #define PCI_DISABLE_MWI
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	unsigned long cacheline_size;
-	u8 byte;
-
-	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
-	if (byte == 0)
-		cacheline_size = 1024;
-	else
-		cacheline_size = (int) byte * 4;
-
-	*strat = PCI_DMA_BURST_MULTIPLE;
-	*strategy_parameter = cacheline_size;
-}
-#endif
-
-#else /* 32-bit */
-
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
-#endif
 #endif /* CONFIG_PPC64 */
 
 extern int pci_domain_nr(struct pci_bus *bus);
diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h
index 64b52b1cf542..9c326565d498 100644
--- a/arch/powerpc/include/asm/pgtable-ppc32.h
+++ b/arch/powerpc/include/asm/pgtable-ppc32.h
@@ -170,24 +170,6 @@ static inline unsigned long pte_update(pte_t *p,
 #ifdef PTE_ATOMIC_UPDATES
 	unsigned long old, tmp;
 
-#ifdef CONFIG_PPC_8xx
-	unsigned long tmp2;
-
-	__asm__ __volatile__("\
-1:	lwarx	%0,0,%4\n\
-	andc	%1,%0,%5\n\
-	or	%1,%1,%6\n\
-	/* 0x200 == Extended encoding, bit 22 */ \
-	/* Bit 22 has to be 1 when _PAGE_USER is unset and _PAGE_RO is set */ \
-	rlwimi	%1,%1,32-1,0x200\n /* get _PAGE_RO */ \
-	rlwinm	%3,%1,32-2,0x200\n /* get _PAGE_USER */ \
-	andc	%1,%1,%3\n\
-	stwcx.	%1,0,%4\n\
-	bne-	1b"
-	: "=&r" (old), "=&r" (tmp), "=m" (*p), "=&r" (tmp2)
-	: "r" (p), "r" (clr), "r" (set), "m" (*p)
-	: "cc" );
-#else /* CONFIG_PPC_8xx */
 	__asm__ __volatile__("\
 1:	lwarx	%0,0,%3\n\
 	andc	%1,%0,%4\n\
@@ -198,7 +180,6 @@ static inline unsigned long pte_update(pte_t *p,
 	: "=&r" (old), "=&r" (tmp), "=m" (*p)
 	: "r" (p), "r" (clr), "r" (set), "m" (*p)
 	: "cc" );
-#endif /* CONFIG_PPC_8xx */
 #else /* PTE_ATOMIC_UPDATES */
 	unsigned long old = pte_val(*p);
 	*p = __pte((old & ~clr) | set);
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 43e6ad424c7f..f890f7ce1593 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -118,7 +118,7 @@
  */
 #ifndef __real_pte
 
-#ifdef STRICT_MM_TYPECHECKS
+#ifdef CONFIG_STRICT_MM_TYPECHECKS
 #define __real_pte(e,p)		((real_pte_t){(e)})
 #define __rpte_to_pte(r)	((r).pte)
 #else
@@ -347,11 +347,27 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 	pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 
 /* Encode and de-code a swap entry */
-#define __swp_type(entry)	(((entry).val >> 1) & 0x3f)
-#define __swp_offset(entry)	((entry).val >> 8)
-#define __swp_entry(type, offset) ((swp_entry_t){((type)<< 1)|((offset)<<8)})
-#define __pte_to_swp_entry(pte)	((swp_entry_t){pte_val(pte) >> PTE_RPN_SHIFT})
-#define __swp_entry_to_pte(x)	((pte_t) { (x).val << PTE_RPN_SHIFT })
+#define MAX_SWAPFILES_CHECK() do { \
+	BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \
+	/*							\
+	 * Don't have overlapping bits with _PAGE_HPTEFLAGS	\
+	 * We filter HPTEFLAGS on set_pte.			\
+	 */							\
+	BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
+	} while (0)
+/*
+ * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT;
+ */
+#define SWP_TYPE_BITS 5
+#define __swp_type(x)		(((x).val >> _PAGE_BIT_SWAP_TYPE) \
+				& ((1UL << SWP_TYPE_BITS) - 1))
+#define __swp_offset(x)		((x).val >> PTE_RPN_SHIFT)
+#define __swp_entry(type, offset)	((swp_entry_t) { \
+					((type) << _PAGE_BIT_SWAP_TYPE) \
+					| ((offset) << PTE_RPN_SHIFT) })
+
+#define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val((pte)) })
+#define __swp_entry_to_pte(x)		__pte((x).val)
 
 void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h
index f9b498292a5c..6f77f71ee964 100644
--- a/arch/powerpc/include/asm/pnv-pci.h
+++ b/arch/powerpc/include/asm/pnv-pci.h
@@ -11,7 +11,7 @@
 #define _ASM_PNV_PCI_H
 
 #include <linux/pci.h>
-#include <misc/cxl.h>
+#include <misc/cxl-base.h>
 
 int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode);
 int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index 5c93f691b495..8452335661a5 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -136,6 +136,8 @@
 #define PPC_INST_DCBAL			0x7c2005ec
 #define PPC_INST_DCBZL			0x7c2007ec
 #define PPC_INST_ICBT			0x7c00002c
+#define PPC_INST_ICSWX			0x7c00032d
+#define PPC_INST_ICSWEPX		0x7c00076d
 #define PPC_INST_ISEL			0x7c00001e
 #define PPC_INST_ISEL_MASK		0xfc00003e
 #define PPC_INST_LDARX			0x7c0000a8
@@ -403,4 +405,15 @@
 #define MFTMR(tmr, r)		stringify_in_c(.long PPC_INST_MFTMR | \
 					       TMRN(tmr) | ___PPC_RT(r))
 
+/* Coprocessor instructions */
+#define PPC_ICSWX(s, a, b)	stringify_in_c(.long PPC_INST_ICSWX |	\
+					       ___PPC_RS(s) |		\
+					       ___PPC_RA(a) |		\
+					       ___PPC_RB(b))
+#define PPC_ICSWEPX(s, a, b)	stringify_in_c(.long PPC_INST_ICSWEPX | \
+					       ___PPC_RS(s) |		\
+					       ___PPC_RA(a) |		\
+					       ___PPC_RB(b))
+
+
 #endif /* _ASM_POWERPC_PPC_OPCODE_H */
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index bf117d8fb45f..28ded5d9b579 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -295,6 +295,15 @@ struct thread_struct {
 #endif
 #ifdef CONFIG_PPC64
 	unsigned long	dscr;
+	/*
+	 * This member element dscr_inherit indicates that the process
+	 * has explicitly attempted and changed the DSCR register value
+	 * for itself. Hence kernel wont use the default CPU DSCR value
+	 * contained in the PACA structure anymore during process context
+	 * switch. Once this variable is set, this behaviour will also be
+	 * inherited to all the children of this process from that point
+	 * onwards.
+	 */
 	int		dscr_inherit;
 	unsigned long	ppr;	/* used to save/restore SMT priority */
 #endif
diff --git a/arch/powerpc/include/asm/pte-8xx.h b/arch/powerpc/include/asm/pte-8xx.h
index 97bae64afdaa..a0e2ba960976 100644
--- a/arch/powerpc/include/asm/pte-8xx.h
+++ b/arch/powerpc/include/asm/pte-8xx.h
@@ -34,35 +34,32 @@
 #define _PAGE_SPECIAL	0x0008	/* SW entry, forced to 0 by the TLB miss */
 #define _PAGE_DIRTY	0x0100	/* C: page changed */
 
-/* These 4 software bits must be masked out when the entry is loaded
- * into the TLB, 1 SW bit left(0x0080).
+/* These 4 software bits must be masked out when the L2 entry is loaded
+ * into the TLB.
  */
-#define _PAGE_GUARDED	0x0010	/* software: guarded access */
-#define _PAGE_ACCESSED	0x0020	/* software: page referenced */
-#define _PAGE_WRITETHRU	0x0040	/* software: caching is write through */
+#define _PAGE_GUARDED	0x0010	/* Copied to L1 G entry in DTLB */
+#define _PAGE_USER	0x0020	/* Copied to L1 APG lsb */
+#define _PAGE_EXEC	0x0040	/* Copied to L1 APG */
+#define _PAGE_WRITETHRU	0x0080	/* software: caching is write through */
+#define _PAGE_ACCESSED	0x0800	/* software: page referenced */
 
-/* Setting any bits in the nibble with the follow two controls will
- * require a TLB exception handler change.  It is assumed unused bits
- * are always zero.
- */
-#define _PAGE_RO	0x0400	/* lsb PP bits */
-#define _PAGE_USER	0x0800	/* msb PP bits */
-/* set when _PAGE_USER is unset and _PAGE_RO is set */
-#define _PAGE_KNLRO	0x0200
+#define _PAGE_RO	0x0600	/* Supervisor RO, User no access */
 
 #define _PMD_PRESENT	0x0001
 #define _PMD_BAD	0x0ff0
 #define _PMD_PAGE_MASK	0x000c
 #define _PMD_PAGE_8M	0x000c
 
-#define _PTE_NONE_MASK _PAGE_KNLRO
-
 /* Until my rework is finished, 8xx still needs atomic PTE updates */
 #define PTE_ATOMIC_UPDATES	1
 
 /* We need to add _PAGE_SHARED to kernel pages */
-#define _PAGE_KERNEL_RO	(_PAGE_SHARED | _PAGE_RO | _PAGE_KNLRO)
-#define _PAGE_KERNEL_ROX	(_PAGE_EXEC | _PAGE_RO | _PAGE_KNLRO)
+#define _PAGE_KERNEL_RO		(_PAGE_SHARED | _PAGE_RO)
+#define _PAGE_KERNEL_ROX	(_PAGE_SHARED | _PAGE_RO | _PAGE_EXEC)
+#define _PAGE_KERNEL_RW		(_PAGE_SHARED | _PAGE_DIRTY | _PAGE_RW | \
+				 _PAGE_HWWRITE)
+#define _PAGE_KERNEL_RWX	(_PAGE_SHARED | _PAGE_DIRTY | _PAGE_RW | \
+				 _PAGE_HWWRITE | _PAGE_EXEC)
 
 #endif /* __KERNEL__ */
 #endif /*  _ASM_POWERPC_PTE_8xx_H */
diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h
index 91a704952ca1..8d8473278d91 100644
--- a/arch/powerpc/include/asm/pte-book3e.h
+++ b/arch/powerpc/include/asm/pte-book3e.h
@@ -11,6 +11,7 @@
 /* Architected bits */
 #define _PAGE_PRESENT	0x000001 /* software: pte contains a translation */
 #define _PAGE_SW1	0x000002
+#define _PAGE_BIT_SWAP_TYPE	2
 #define _PAGE_BAP_SR	0x000004
 #define _PAGE_BAP_UR	0x000008
 #define _PAGE_BAP_SW	0x000010
diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h
index c5a755ef7011..b7c8d079c121 100644
--- a/arch/powerpc/include/asm/pte-common.h
+++ b/arch/powerpc/include/asm/pte-common.h
@@ -85,10 +85,8 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void);
  * 64-bit PTEs
  */
 #if defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT)
-#define PTE_RPN_MAX	(1ULL << (64 - PTE_RPN_SHIFT))
 #define PTE_RPN_MASK	(~((1ULL<<PTE_RPN_SHIFT)-1))
 #else
-#define PTE_RPN_MAX	(1UL << (32 - PTE_RPN_SHIFT))
 #define PTE_RPN_MASK	(~((1UL<<PTE_RPN_SHIFT)-1))
 #endif
 
diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h
index fc852f7e7b3a..ef612c160da7 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/pte-hash64.h
@@ -16,6 +16,7 @@
  */
 #define _PAGE_PRESENT		0x0001 /* software: pte contains a translation */
 #define _PAGE_USER		0x0002 /* matches one of the PP bits */
+#define _PAGE_BIT_SWAP_TYPE	2
 #define _PAGE_EXEC		0x0004 /* No execute on POWER4 and newer (we invert) */
 #define _PAGE_GUARDED		0x0008
 /* We can derive Memory coherence from _PAGE_NO_CACHE */
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index f1863a138b4a..71f2b3f02cf8 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -358,7 +358,7 @@ SYSCALL_SPU(setns)
 COMPAT_SYS(process_vm_readv)
 COMPAT_SYS(process_vm_writev)
 SYSCALL(finit_module)
-SYSCALL(ni_syscall) /* sys_kcmp */
+SYSCALL(kcmp) /* sys_kcmp */
 SYSCALL_SPU(sched_setattr)
 SYSCALL_SPU(sched_getattr)
 SYSCALL_SPU(renameat2)
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 5f1048eaa5b6..8b3b46b7b0f2 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -87,7 +87,7 @@ static inline int prrn_is_enabled(void)
 #include <asm/smp.h>
 
 #define topology_physical_package_id(cpu)	(cpu_to_chip_id(cpu))
-#define topology_thread_cpumask(cpu)	(per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)	(per_cpu(cpu_sibling_map, cpu))
 #define topology_core_cpumask(cpu)	(per_cpu(cpu_core_map, cpu))
 #define topology_core_id(cpu)		(cpu_to_core_id(cpu))
 #endif
diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
index c15da6073cb8..8e86b48d0369 100644
--- a/arch/powerpc/include/asm/trace.h
+++ b/arch/powerpc/include/asm/trace.h
@@ -144,6 +144,26 @@ TRACE_EVENT_FN(opal_exit,
 );
 #endif
 
+TRACE_EVENT(hash_fault,
+
+	    TP_PROTO(unsigned long addr, unsigned long access, unsigned long trap),
+	    TP_ARGS(addr, access, trap),
+	    TP_STRUCT__entry(
+		    __field(unsigned long, addr)
+		    __field(unsigned long, access)
+		    __field(unsigned long, trap)
+		    ),
+
+	    TP_fast_assign(
+		    __entry->addr = addr;
+		    __entry->access = access;
+		    __entry->trap = trap;
+		    ),
+
+	    TP_printk("hash fault with addr 0x%lx and access = 0x%lx trap = 0x%lx",
+		      __entry->addr, __entry->access, __entry->trap)
+);
+
 #endif /* _TRACE_POWERPC_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index a0c071d24e0e..2a8ebae0936b 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -265,7 +265,7 @@ do {								\
 ({								\
 	long __gu_err;						\
 	unsigned long __gu_val;					\
-	const __typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
+	__typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
 	__chk_user_ptr(ptr);					\
 	if (!is_kernel_addr((unsigned long)__gu_addr))		\
 		might_fault();					\
@@ -279,7 +279,7 @@ do {								\
 ({								\
 	long __gu_err;						\
 	long long __gu_val;					\
-	const __typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
+	__typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
 	__chk_user_ptr(ptr);					\
 	if (!is_kernel_addr((unsigned long)__gu_addr))		\
 		might_fault();					\
@@ -293,7 +293,7 @@ do {								\
 ({									\
 	long __gu_err = -EFAULT;					\
 	unsigned long  __gu_val = 0;					\
-	const __typeof__(*(ptr)) __user *__gu_addr = (ptr);		\
+	__typeof__(*(ptr)) __user *__gu_addr = (ptr);		\
 	might_fault();							\
 	if (access_ok(VERIFY_READ, __gu_addr, (size)))			\
 		__get_user_size(__gu_val, __gu_addr, (size), __gu_err);	\
@@ -305,7 +305,7 @@ do {								\
 ({								\
 	long __gu_err;						\
 	unsigned long __gu_val;					\
-	const __typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
+	__typeof__(*(ptr)) __user *__gu_addr = (ptr);	\
 	__chk_user_ptr(ptr);					\
 	__get_user_size(__gu_val, __gu_addr, (size), __gu_err);	\
 	(x) = (__force __typeof__(*(ptr)))__gu_val;			\
diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild
index 79c4068be278..f44a027818af 100644
--- a/arch/powerpc/include/uapi/asm/Kbuild
+++ b/arch/powerpc/include/uapi/asm/Kbuild
@@ -18,6 +18,7 @@ header-y += kvm_para.h
 header-y += mman.h
 header-y += msgbuf.h
 header-y += nvram.h
+header-y += opal-prd.h
 header-y += param.h
 header-y += perf_event.h
 header-y += poll.h
diff --git a/arch/powerpc/include/uapi/asm/cputable.h b/arch/powerpc/include/uapi/asm/cputable.h
index de2c0e4ee1aa..43686043e297 100644
--- a/arch/powerpc/include/uapi/asm/cputable.h
+++ b/arch/powerpc/include/uapi/asm/cputable.h
@@ -42,5 +42,6 @@
 #define PPC_FEATURE2_ISEL		0x08000000
 #define PPC_FEATURE2_TAR		0x04000000
 #define PPC_FEATURE2_VEC_CRYPTO		0x02000000
+#define PPC_FEATURE2_HTM_NOSC		0x01000000
 
 #endif /* _UAPI__ASM_POWERPC_CPUTABLE_H */
diff --git a/arch/powerpc/include/uapi/asm/eeh.h b/arch/powerpc/include/uapi/asm/eeh.h
new file mode 100644
index 000000000000..291b7d1814a6
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/eeh.h
@@ -0,0 +1,56 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2015
+ *
+ * Authors: Gavin Shan <gwshan@linux.vnet.ibm.com>
+ */
+
+#ifndef _ASM_POWERPC_EEH_H
+#define _ASM_POWERPC_EEH_H
+
+/* PE states */
+#define EEH_PE_STATE_NORMAL		0	/* Normal state		*/
+#define EEH_PE_STATE_RESET		1	/* PE reset asserted	*/
+#define EEH_PE_STATE_STOPPED_IO_DMA	2	/* Frozen PE		*/
+#define EEH_PE_STATE_STOPPED_DMA	4	/* Stopped DMA only	*/
+#define EEH_PE_STATE_UNAVAIL		5	/* Unavailable		*/
+
+/* EEH error types and functions */
+#define EEH_ERR_TYPE_32			0       /* 32-bits error	*/
+#define EEH_ERR_TYPE_64			1       /* 64-bits error	*/
+#define EEH_ERR_FUNC_MIN		0
+#define EEH_ERR_FUNC_LD_MEM_ADDR	0	/* Memory load	*/
+#define EEH_ERR_FUNC_LD_MEM_DATA	1
+#define EEH_ERR_FUNC_LD_IO_ADDR		2	/* IO load	*/
+#define EEH_ERR_FUNC_LD_IO_DATA		3
+#define EEH_ERR_FUNC_LD_CFG_ADDR	4	/* Config load	*/
+#define EEH_ERR_FUNC_LD_CFG_DATA	5
+#define EEH_ERR_FUNC_ST_MEM_ADDR	6	/* Memory store	*/
+#define EEH_ERR_FUNC_ST_MEM_DATA	7
+#define EEH_ERR_FUNC_ST_IO_ADDR		8	/* IO store	*/
+#define EEH_ERR_FUNC_ST_IO_DATA		9
+#define EEH_ERR_FUNC_ST_CFG_ADDR	10	/* Config store	*/
+#define EEH_ERR_FUNC_ST_CFG_DATA	11
+#define EEH_ERR_FUNC_DMA_RD_ADDR	12	/* DMA read	*/
+#define EEH_ERR_FUNC_DMA_RD_DATA	13
+#define EEH_ERR_FUNC_DMA_RD_MASTER	14
+#define EEH_ERR_FUNC_DMA_RD_TARGET	15
+#define EEH_ERR_FUNC_DMA_WR_ADDR	16	/* DMA write	*/
+#define EEH_ERR_FUNC_DMA_WR_DATA	17
+#define EEH_ERR_FUNC_DMA_WR_MASTER	18
+#define EEH_ERR_FUNC_DMA_WR_TARGET	19
+#define EEH_ERR_FUNC_MAX		19
+
+#endif /* _ASM_POWERPC_EEH_H */
diff --git a/arch/powerpc/include/uapi/asm/opal-prd.h b/arch/powerpc/include/uapi/asm/opal-prd.h
new file mode 100644
index 000000000000..319ff4a26158
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/opal-prd.h
@@ -0,0 +1,58 @@
+/*
+ * OPAL Runtime Diagnostics interface driver
+ * Supported on POWERNV platform
+ *
+ * (C) Copyright IBM 2015
+ *
+ * Author: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
+ * Author: Jeremy Kerr <jk@ozlabs.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _UAPI_ASM_POWERPC_OPAL_PRD_H_
+#define _UAPI_ASM_POWERPC_OPAL_PRD_H_
+
+#include <linux/types.h>
+
+/**
+ * The version of the kernel interface of the PRD system. This describes the
+ * interface available for the /dev/opal-prd device. The actual PRD message
+ * layout and content is private to the firmware <--> userspace interface, so
+ * is not covered by this versioning.
+ *
+ * Future interface versions are backwards-compatible; if a later kernel
+ * version is encountered, functionality provided in earlier versions
+ * will work.
+ */
+#define OPAL_PRD_KERNEL_VERSION		1
+
+#define OPAL_PRD_GET_INFO		_IOR('o', 0x01, struct opal_prd_info)
+#define OPAL_PRD_SCOM_READ		_IOR('o', 0x02, struct opal_prd_scom)
+#define OPAL_PRD_SCOM_WRITE		_IOW('o', 0x03, struct opal_prd_scom)
+
+#ifndef __ASSEMBLY__
+
+struct opal_prd_info {
+	__u64	version;
+	__u64	reserved[3];
+};
+
+struct opal_prd_scom {
+	__u64	chip;
+	__u64	addr;
+	__u64	data;
+	__s64	rc;
+};
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _UAPI_ASM_POWERPC_OPAL_PRD_H */
diff --git a/arch/powerpc/include/uapi/asm/tm.h b/arch/powerpc/include/uapi/asm/tm.h
index 5d836b7c1176..5047659815a5 100644
--- a/arch/powerpc/include/uapi/asm/tm.h
+++ b/arch/powerpc/include/uapi/asm/tm.h
@@ -11,7 +11,7 @@
 #define TM_CAUSE_RESCHED	0xde
 #define TM_CAUSE_TLBI		0xdc
 #define TM_CAUSE_FAC_UNAV	0xda
-#define TM_CAUSE_SYSCALL	0xd8  /* future use */
+#define TM_CAUSE_SYSCALL	0xd8
 #define TM_CAUSE_MISC		0xd6  /* future use */
 #define TM_CAUSE_SIGNAL		0xd4
 #define TM_CAUSE_ALIGNMENT	0xd2
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index c1ebbdaac28f..87c7d1473488 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -33,11 +33,12 @@ obj-y				:= cputable.o ptrace.o syscalls.o \
 				   signal.o sysfs.o cacheinfo.o time.o \
 				   prom.o traps.o setup-common.o \
 				   udbg.o misc.o io.o dma.o \
-				   misc_$(CONFIG_WORD_SIZE).o vdso32/ \
+				   misc_$(CONFIG_WORD_SIZE).o \
 				   of_platform.o prom_parse.o
 obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
 				   signal_64.o ptrace32.o \
 				   paca.o nvram_64.o firmware.o
+obj-$(CONFIG_VDSO32)		+= vdso32/
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_ppc970.o cpu_setup_pa6t.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_power.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 0034b6b3556a..98230579d99c 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -247,7 +247,7 @@ int main(void)
 #endif
 	DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id));
 	DEFINE(PACAKEXECSTATE, offsetof(struct paca_struct, kexec_state));
-	DEFINE(PACA_DSCR, offsetof(struct paca_struct, dscr_default));
+	DEFINE(PACA_DSCR_DEFAULT, offsetof(struct paca_struct, dscr_default));
 	DEFINE(PACA_STARTTIME, offsetof(struct paca_struct, starttime));
 	DEFINE(PACA_STARTTIME_USER, offsetof(struct paca_struct, starttime_user));
 	DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time));
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 60262fdf35ba..7d80bfdfb15e 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -108,7 +108,9 @@ extern void __restore_cpu_e6500(void);
 				 PPC_FEATURE_TRUE_LE | \
 				 PPC_FEATURE_PSERIES_PERFMON_COMPAT)
 #define COMMON_USER2_POWER8	(PPC_FEATURE2_ARCH_2_07 | \
-				 PPC_FEATURE2_HTM_COMP | PPC_FEATURE2_DSCR | \
+				 PPC_FEATURE2_HTM_COMP | \
+				 PPC_FEATURE2_HTM_NOSC_COMP | \
+				 PPC_FEATURE2_DSCR | \
 				 PPC_FEATURE2_ISEL | PPC_FEATURE2_TAR | \
 				 PPC_FEATURE2_VEC_CRYPTO)
 #define COMMON_USER_PA6T	(COMMON_USER_PPC64 | PPC_FEATURE_PA6T |\
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 484b2d4462c1..35e4dcc5dce3 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -248,6 +248,14 @@ int dma_set_mask(struct device *dev, u64 dma_mask)
 {
 	if (ppc_md.dma_set_mask)
 		return ppc_md.dma_set_mask(dev, dma_mask);
+
+	if (dev_is_pci(dev)) {
+		struct pci_dev *pdev = to_pci_dev(dev);
+		struct pci_controller *phb = pci_bus_to_host(pdev->bus);
+		if (phb->controller_ops.dma_set_mask)
+			return phb->controller_ops.dma_set_mask(pdev, dma_mask);
+	}
+
 	return __dma_set_mask(dev, dma_mask);
 }
 EXPORT_SYMBOL(dma_set_mask);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9ee61d15653d..af9b597b10af 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -144,8 +144,6 @@ struct eeh_stats {
 
 static struct eeh_stats eeh_stats;
 
-#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE)
-
 static int __init eeh_setup(char *str)
 {
 	if (!strcmp(str, "off"))
@@ -719,7 +717,7 @@ static void *eeh_restore_dev_state(void *data, void *userdata)
 
 	/* The caller should restore state for the specified device */
 	if (pdev != dev)
-		pci_save_state(pdev);
+		pci_restore_state(pdev);
 
 	return NULL;
 }
@@ -1412,13 +1410,11 @@ static int dev_has_iommu_table(struct device *dev, void *data)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct pci_dev **ppdev = data;
-	struct iommu_table *tbl;
 
 	if (!dev)
 		return 0;
 
-	tbl = get_iommu_table_base(dev);
-	if (tbl && tbl->it_group) {
+	if (dev->iommu_group) {
 		*ppdev = pdev;
 		return 1;
 	}
@@ -1647,6 +1643,41 @@ int eeh_pe_configure(struct eeh_pe *pe)
 }
 EXPORT_SYMBOL_GPL(eeh_pe_configure);
 
+/**
+ * eeh_pe_inject_err - Injecting the specified PCI error to the indicated PE
+ * @pe: the indicated PE
+ * @type: error type
+ * @function: error function
+ * @addr: address
+ * @mask: address mask
+ *
+ * The routine is called to inject the specified PCI error, which
+ * is determined by @type and @function, to the indicated PE for
+ * testing purpose.
+ */
+int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func,
+		      unsigned long addr, unsigned long mask)
+{
+	/* Invalid PE ? */
+	if (!pe)
+		return -ENODEV;
+
+	/* Unsupported operation ? */
+	if (!eeh_ops || !eeh_ops->err_inject)
+		return -ENOENT;
+
+	/* Check on PCI error type */
+	if (type != EEH_ERR_TYPE_32 && type != EEH_ERR_TYPE_64)
+		return -EINVAL;
+
+	/* Check on PCI error function */
+	if (func < EEH_ERR_FUNC_MIN || func > EEH_ERR_FUNC_MAX)
+		return -EINVAL;
+
+	return eeh_ops->err_inject(pe, type, func, addr, mask);
+}
+EXPORT_SYMBOL_GPL(eeh_pe_inject_err);
+
 static int proc_eeh_show(struct seq_file *m, void *v)
 {
 	if (!eeh_enabled()) {
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index eeabeabea49c..a1e86e172e3c 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -48,11 +48,11 @@
  */
 struct pci_io_addr_range {
 	struct rb_node rb_node;
-	unsigned long addr_lo;
-	unsigned long addr_hi;
+	resource_size_t addr_lo;
+	resource_size_t addr_hi;
 	struct eeh_dev *edev;
 	struct pci_dev *pcidev;
-	unsigned int flags;
+	unsigned long flags;
 };
 
 static struct pci_io_addr_cache {
@@ -125,8 +125,8 @@ static void eeh_addr_cache_print(struct pci_io_addr_cache *cache)
 
 /* Insert address range into the rb tree. */
 static struct pci_io_addr_range *
-eeh_addr_cache_insert(struct pci_dev *dev, unsigned long alo,
-		      unsigned long ahi, unsigned int flags)
+eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo,
+		      resource_size_t ahi, unsigned long flags)
 {
 	struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node;
 	struct rb_node *parent = NULL;
@@ -197,9 +197,9 @@ static void __eeh_addr_cache_insert_dev(struct pci_dev *dev)
 
 	/* Walk resources on this device, poke them into the tree */
 	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
-		unsigned long start = pci_resource_start(dev,i);
-		unsigned long end = pci_resource_end(dev,i);
-		unsigned int flags = pci_resource_flags(dev,i);
+		resource_size_t start = pci_resource_start(dev,i);
+		resource_size_t end = pci_resource_end(dev,i);
+		unsigned long flags = pci_resource_flags(dev,i);
 
 		/* We are interested only bus addresses, not dma or other stuff */
 		if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM)))
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 24768ff3cb73..89eb4bc34d3a 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -660,7 +660,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 	eeh_pe_dev_traverse(pe, eeh_report_error, &result);
 
 	/* Get the current PCI slot state. This can take a long time,
-	 * sometimes over 3 seconds for certain systems.
+	 * sometimes over 300 seconds for certain systems.
 	 */
 	rc = eeh_ops->wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
 	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index afbc20019c2e..579e0f9a2d57 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -34,6 +34,7 @@
 #include <asm/ftrace.h>
 #include <asm/hw_irq.h>
 #include <asm/context_tracking.h>
+#include <asm/tm.h>
 
 /*
  * System calls.
@@ -51,6 +52,12 @@ exception_marker:
 
 	.globl system_call_common
 system_call_common:
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+	extrdi.	r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */
+	bne	tabort_syscall
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
 	andi.	r10,r12,MSR_PR
 	mr	r10,r1
 	addi	r1,r1,-INT_FRAME_SIZE
@@ -311,6 +318,34 @@ syscall_exit_work:
 	bl	do_syscall_trace_leave
 	b	ret_from_except
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+tabort_syscall:
+	/* Firstly we need to enable TM in the kernel */
+	mfmsr	r10
+	li	r13, 1
+	rldimi	r10, r13, MSR_TM_LG, 63-MSR_TM_LG
+	mtmsrd	r10, 0
+
+	/* tabort, this dooms the transaction, nothing else */
+	li	r13, (TM_CAUSE_SYSCALL|TM_CAUSE_PERSISTENT)
+	TABORT(R13)
+
+	/*
+	 * Return directly to userspace. We have corrupted user register state,
+	 * but userspace will never see that register state. Execution will
+	 * resume after the tbegin of the aborted transaction with the
+	 * checkpointed register state.
+	 */
+	li	r13, MSR_RI
+	andc	r10, r10, r13
+	mtmsrd	r10, 1
+	mtspr	SPRN_SRR0, r11
+	mtspr	SPRN_SRR1, r12
+
+	rfid
+	b	.	/* prevent speculative execution */
+#endif
+
 /* Save non-volatile GPRs, if not already saved. */
 _GLOBAL(save_nvgprs)
 	ld	r11,_TRAP(r1)
@@ -556,7 +591,7 @@ BEGIN_FTR_SECTION
 	ld	r0,THREAD_DSCR(r4)
 	cmpwi	r6,0
 	bne	1f
-	ld	r0,PACA_DSCR(r13)
+	ld	r0,PACA_DSCR_DEFAULT(r13)
 1:
 BEGIN_FTR_SECTION_NESTED(70)
 	mfspr	r8, SPRN_FSCR
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 9519e6bdc6d7..0a0399c2af11 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -59,14 +59,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
 
 #if defined(CONFIG_RELOCATABLE)
 	/*
-	 * We can't branch directly; in the direct case we use LR
-	 * and system_call_entry restores LR.  (We thus need to move
-	 * LR to r10 in the RFID case too.)
+	 * We can't branch directly so we do it via the CTR which
+	 * is volatile across system calls.
 	 */
 #define SYSCALL_PSERIES_2_DIRECT				\
 	mflr	r10 ;						\
 	ld	r12,PACAKBASE(r13) ; 				\
-	LOAD_HANDLER(r12, system_call_entry_direct) ;		\
+	LOAD_HANDLER(r12, system_call_entry) ;			\
 	mtctr	r12 ;						\
 	mfspr	r12,SPRN_SRR1 ;					\
 	/* Re-use of r13... No spare regs to do this */	\
@@ -80,7 +79,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
 	mfspr	r12,SPRN_SRR1 ;					\
 	li	r10,MSR_RI ;					\
 	mtmsrd 	r10,1 ;			/* Set RI (EE=0) */	\
-	b	system_call_entry_direct ;
+	b	system_call_common ;
 #endif
 
 /*
@@ -969,13 +968,6 @@ hv_facility_unavailable_relon_trampoline:
 __end_interrupts:
 
 	.align	7
-system_call_entry_direct:
-#if defined(CONFIG_RELOCATABLE)
-	/* The first level prologue may have used LR to get here, saving
-	 * orig in r10.  To save hacking/ifdeffing common code, restore here.
-	 */
-	mtlr	r10
-#endif
 system_call_entry:
 	b	system_call_common
 
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 9b53fe139bf6..78c1eba4c04a 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -48,6 +48,19 @@
 	mtspr	spr, reg
 #endif
 
+/* Macro to test if an address is a kernel address */
+#if CONFIG_TASK_SIZE <= 0x80000000 && CONFIG_PAGE_OFFSET >= 0x80000000
+#define IS_KERNEL(tmp, addr)		\
+	andis.	tmp, addr, 0x8000	/* Address >= 0x80000000 */
+#define BRANCH_UNLESS_KERNEL(label)	beq	label
+#else
+#define IS_KERNEL(tmp, addr)		\
+	rlwinm	tmp, addr, 16, 16, 31;	\
+	cmpli	cr0, tmp, PAGE_OFFSET >> 16
+#define BRANCH_UNLESS_KERNEL(label)	blt	label
+#endif
+
+
 /*
  * Value for the bits that have fixed value in RPN entries.
  * Also used for tagging DAR for DTLBerror.
@@ -116,13 +129,13 @@ turn_on_mmu:
  */
 #define EXCEPTION_PROLOG	\
 	EXCEPTION_PROLOG_0;	\
+	mfcr	r10;		\
 	EXCEPTION_PROLOG_1;	\
 	EXCEPTION_PROLOG_2
 
 #define EXCEPTION_PROLOG_0	\
 	mtspr	SPRN_SPRG_SCRATCH0,r10;	\
-	mtspr	SPRN_SPRG_SCRATCH1,r11;	\
-	mfcr	r10
+	mtspr	SPRN_SPRG_SCRATCH1,r11
 
 #define EXCEPTION_PROLOG_1	\
 	mfspr	r11,SPRN_SRR1;		/* check whether user or kernel */ \
@@ -162,7 +175,6 @@ turn_on_mmu:
  * Exception exit code.
  */
 #define EXCEPTION_EPILOG_0	\
-	mtcr	r10;		\
 	mfspr	r10,SPRN_SPRG_SCRATCH0;	\
 	mfspr	r11,SPRN_SPRG_SCRATCH1
 
@@ -297,19 +309,22 @@ SystemCall:
  * We have to use the MD_xxx registers for the tablewalk because the
  * equivalent MI_xxx registers only perform the attribute functions.
  */
+
+#ifdef CONFIG_8xx_CPU15
+#define INVALIDATE_ADJACENT_PAGES_CPU15(tmp, addr)	\
+	addi	tmp, addr, PAGE_SIZE;	\
+	tlbie	tmp;			\
+	addi	tmp, addr, -PAGE_SIZE;	\
+	tlbie	tmp
+#else
+#define INVALIDATE_ADJACENT_PAGES_CPU15(tmp, addr)
+#endif
+
 InstructionTLBMiss:
 #ifdef CONFIG_8xx_CPU6
-	mtspr	SPRN_DAR, r3
+	mtspr	SPRN_SPRG_SCRATCH2, r3
 #endif
 	EXCEPTION_PROLOG_0
-	mtspr	SPRN_SPRG_SCRATCH2, r10
-	mfspr	r10, SPRN_SRR0	/* Get effective address of fault */
-#ifdef CONFIG_8xx_CPU15
-	addi	r11, r10, PAGE_SIZE
-	tlbie	r11
-	addi	r11, r10, -PAGE_SIZE
-	tlbie	r11
-#endif
 
 	/* If we are faulting a kernel address, we have to use the
 	 * kernel page tables.
@@ -317,24 +332,34 @@ InstructionTLBMiss:
 #ifdef CONFIG_MODULES
 	/* Only modules will cause ITLB Misses as we always
 	 * pin the first 8MB of kernel memory */
-	andis.	r11, r10, 0x8000	/* Address >= 0x80000000 */
-#endif
+	mfspr	r11, SPRN_SRR0	/* Get effective address of fault */
+	INVALIDATE_ADJACENT_PAGES_CPU15(r10, r11)
+	mfcr	r10
+	IS_KERNEL(r11, r11)
 	mfspr	r11, SPRN_M_TW	/* Get level 1 table */
-#ifdef CONFIG_MODULES
-	beq	3f
+	BRANCH_UNLESS_KERNEL(3f)
 	lis	r11, (swapper_pg_dir-PAGE_OFFSET)@ha
 3:
+	mtcr	r10
+	mfspr	r10, SPRN_SRR0	/* Get effective address of fault */
+#else
+	mfspr	r10, SPRN_SRR0	/* Get effective address of fault */
+	INVALIDATE_ADJACENT_PAGES_CPU15(r11, r10)
+	mfspr	r11, SPRN_M_TW	/* Get level 1 table base address */
 #endif
 	/* Insert level 1 index */
 	rlwimi	r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, 29
 	lwz	r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11)	/* Get the level 1 entry */
 
-	/* Load the MI_TWC with the attributes for this "segment." */
-	MTSPR_CPU6(SPRN_MI_TWC, r11, r3)	/* Set segment attributes */
-	rlwinm	r11, r11,0,0,19	/* Extract page descriptor page address */
 	/* Extract level 2 index */
 	rlwinm	r10, r10, 32 - (PAGE_SHIFT - 2), 32 - PAGE_SHIFT, 29
-	lwzx	r10, r10, r11	/* Get the pte */
+	rlwimi	r10, r11, 0, 0, 32 - PAGE_SHIFT - 1	/* Add level 2 base */
+	lwz	r10, 0(r10)	/* Get the pte */
+
+	/* Insert the APG into the TWC from the Linux PTE. */
+	rlwimi	r11, r10, 0, 25, 26
+	/* Load the MI_TWC with the attributes for this "segment." */
+	MTSPR_CPU6(SPRN_MI_TWC, r11, r3)	/* Set segment attributes */
 
 #ifdef CONFIG_SWAP
 	rlwinm	r11, r10, 32-5, _PAGE_PRESENT
@@ -343,40 +368,41 @@ InstructionTLBMiss:
 #endif
 	li	r11, RPN_PATTERN
 	/* The Linux PTE won't go exactly into the MMU TLB.
-	 * Software indicator bits 21 and 28 must be clear.
+	 * Software indicator bits 20-23 and 28 must be clear.
 	 * Software indicator bits 24, 25, 26, and 27 must be
 	 * set.  All other Linux PTE bits control the behavior
 	 * of the MMU.
 	 */
-	rlwimi	r10, r11, 0, 0x07f8	/* Set 24-27, clear 21-23,28 */
+	rlwimi	r10, r11, 0, 0x0ff8	/* Set 24-27, clear 20-23,28 */
 	MTSPR_CPU6(SPRN_MI_RPN, r10, r3)	/* Update TLB entry */
 
 	/* Restore registers */
 #ifdef CONFIG_8xx_CPU6
-	mfspr	r3, SPRN_DAR
-	mtspr	SPRN_DAR, r11	/* Tag DAR */
+	mfspr	r3, SPRN_SPRG_SCRATCH2
 #endif
-	mfspr	r10, SPRN_SPRG_SCRATCH2
 	EXCEPTION_EPILOG_0
 	rfi
 
 	. = 0x1200
 DataStoreTLBMiss:
 #ifdef CONFIG_8xx_CPU6
-	mtspr	SPRN_DAR, r3
+	mtspr	SPRN_SPRG_SCRATCH2, r3
 #endif
 	EXCEPTION_PROLOG_0
-	mtspr	SPRN_SPRG_SCRATCH2, r10
-	mfspr	r10, SPRN_MD_EPN
+	mfcr	r10
 
 	/* If we are faulting a kernel address, we have to use the
 	 * kernel page tables.
 	 */
-	andis.	r11, r10, 0x8000
+	mfspr	r11, SPRN_MD_EPN
+	IS_KERNEL(r11, r11)
 	mfspr	r11, SPRN_M_TW	/* Get level 1 table */
-	beq	3f
+	BRANCH_UNLESS_KERNEL(3f)
 	lis	r11, (swapper_pg_dir-PAGE_OFFSET)@ha
 3:
+	mtcr	r10
+	mfspr	r10, SPRN_MD_EPN
+
 	/* Insert level 1 index */
 	rlwimi	r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, 29
 	lwz	r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11)	/* Get the level 1 entry */
@@ -388,13 +414,13 @@ DataStoreTLBMiss:
 	rlwimi	r10, r11, 0, 0, 32 - PAGE_SHIFT - 1	/* Add level 2 base */
 	lwz	r10, 0(r10)	/* Get the pte */
 
-	/* Insert the Guarded flag into the TWC from the Linux PTE.
-	 * It is bit 27 of both the Linux PTE and the TWC (at least
+	/* Insert the Guarded flag and APG into the TWC from the Linux PTE.
+	 * It is bit 26-27 of both the Linux PTE and the TWC (at least
 	 * I got that right :-).  It will be better when we can put
 	 * this into the Linux pgd/pmd and load it in the operation
 	 * above.
 	 */
-	rlwimi	r11, r10, 0, 27, 27
+	rlwimi	r11, r10, 0, 26, 27
 	/* Insert the WriteThru flag into the TWC from the Linux PTE.
 	 * It is bit 25 in the Linux PTE and bit 30 in the TWC
 	 */
@@ -423,14 +449,14 @@ DataStoreTLBMiss:
 	 */
 	li	r11, RPN_PATTERN
 	rlwimi	r10, r11, 0, 24, 28	/* Set 24-27, clear 28 */
+	rlwimi	r10, r11, 0, 20, 20	/* clear 20 */
 	MTSPR_CPU6(SPRN_MD_RPN, r10, r3)	/* Update TLB entry */
 
 	/* Restore registers */
 #ifdef CONFIG_8xx_CPU6
-	mfspr	r3, SPRN_DAR
+	mfspr	r3, SPRN_SPRG_SCRATCH2
 #endif
 	mtspr	SPRN_DAR, r11	/* Tag DAR */
-	mfspr	r10, SPRN_SPRG_SCRATCH2
 	EXCEPTION_EPILOG_0
 	rfi
 
@@ -456,6 +482,7 @@ InstructionTLBError:
 	. = 0x1400
 DataTLBError:
 	EXCEPTION_PROLOG_0
+	mfcr	r10
 
 	mfspr	r11, SPRN_DAR
 	cmpwi	cr0, r11, RPN_PATTERN
@@ -503,9 +530,9 @@ FixupDAR:/* Entry point for dcbx workaround. */
 	mtspr	SPRN_SPRG_SCRATCH2, r10
 	/* fetch instruction from memory. */
 	mfspr	r10, SPRN_SRR0
-	andis.	r11, r10, 0x8000	/* Address >= 0x80000000 */
+	IS_KERNEL(r11, r10)
 	mfspr	r11, SPRN_M_TW	/* Get level 1 table */
-	beq	3f
+	BRANCH_UNLESS_KERNEL(3f)
 	lis	r11, (swapper_pg_dir-PAGE_OFFSET)@ha
 	/* Insert level 1 index */
 3:	rlwimi	r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, 29
@@ -743,15 +770,20 @@ initial_mmu:
 	ori	r8, r8, MI_EVALID	/* Mark it valid */
 	mtspr	SPRN_MI_EPN, r8
 	mtspr	SPRN_MD_EPN, r8
-	li	r8, MI_PS8MEG		/* Set 8M byte page */
+	li	r8, MI_PS8MEG | (2 << 5)	/* Set 8M byte page, APG 2 */
 	ori	r8, r8, MI_SVALID	/* Make it valid */
 	mtspr	SPRN_MI_TWC, r8
+	li	r8, MI_PS8MEG		/* Set 8M byte page, APG 0 */
+	ori	r8, r8, MI_SVALID	/* Make it valid */
 	mtspr	SPRN_MD_TWC, r8
 	li	r8, MI_BOOTINIT		/* Create RPN for address 0 */
 	mtspr	SPRN_MI_RPN, r8		/* Store TLB entry */
 	mtspr	SPRN_MD_RPN, r8
-	lis	r8, MI_Kp@h		/* Set the protection mode */
+	lis	r8, MI_APG_INIT@h	/* Set protection modes */
+	ori	r8, r8, MI_APG_INIT@l
 	mtspr	SPRN_MI_AP, r8
+	lis	r8, MD_APG_INIT@h
+	ori	r8, r8, MD_APG_INIT@l
 	mtspr	SPRN_MD_AP, r8
 
 	/* Map another 8 MByte at the IMMR to get the processor
diff --git a/arch/powerpc/kernel/idle_e500.S b/arch/powerpc/kernel/idle_e500.S
index 15448668988d..b9b6ef510be1 100644
--- a/arch/powerpc/kernel/idle_e500.S
+++ b/arch/powerpc/kernel/idle_e500.S
@@ -58,15 +58,6 @@ BEGIN_FTR_SECTION
 	mtlr	r0
 	lis	r3,HID0_NAP@h
 END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
-BEGIN_FTR_SECTION
-	msync
-	li	r7,L2CSR0_L2FL@l
-	mtspr	SPRN_L2CSR0,r7
-2:
-	mfspr	r7,SPRN_L2CSR0
-	andi.	r4,r7,L2CSR0_L2FL@l
-	bne	2b
-END_FTR_SECTION_IFSET(CPU_FTR_L2CSR|CPU_FTR_CAN_NAP)
 1:
 	/* Go to NAP or DOZE now */
 	mfspr	r4,SPRN_HID0
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index b054f33ab1fb..a8e3490b54e3 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -322,11 +322,11 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	ret = entry << tbl->it_page_shift;	/* Set the return dma address */
 
 	/* Put the TCEs in the HW table */
-	build_fail = ppc_md.tce_build(tbl, entry, npages,
+	build_fail = tbl->it_ops->set(tbl, entry, npages,
 				      (unsigned long)page &
 				      IOMMU_PAGE_MASK(tbl), direction, attrs);
 
-	/* ppc_md.tce_build() only returns non-zero for transient errors.
+	/* tbl->it_ops->set() only returns non-zero for transient errors.
 	 * Clean up the table bitmap in this case and return
 	 * DMA_ERROR_CODE. For all other errors the functionality is
 	 * not altered.
@@ -337,8 +337,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	}
 
 	/* Flush/invalidate TLB caches if necessary */
-	if (ppc_md.tce_flush)
-		ppc_md.tce_flush(tbl);
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
 
 	/* Make sure updates are seen by hardware */
 	mb();
@@ -408,7 +408,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	if (!iommu_free_check(tbl, dma_addr, npages))
 		return;
 
-	ppc_md.tce_free(tbl, entry, npages);
+	tbl->it_ops->clear(tbl, entry, npages);
 
 	spin_lock_irqsave(&(pool->lock), flags);
 	bitmap_clear(tbl->it_map, free_entry, npages);
@@ -424,8 +424,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	 * not do an mb() here on purpose, it is not needed on any of
 	 * the current platforms.
 	 */
-	if (ppc_md.tce_flush)
-		ppc_md.tce_flush(tbl);
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
 }
 
 int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
@@ -495,7 +495,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 			    npages, entry, dma_addr);
 
 		/* Insert into HW table */
-		build_fail = ppc_md.tce_build(tbl, entry, npages,
+		build_fail = tbl->it_ops->set(tbl, entry, npages,
 					      vaddr & IOMMU_PAGE_MASK(tbl),
 					      direction, attrs);
 		if(unlikely(build_fail))
@@ -534,8 +534,8 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 	}
 
 	/* Flush/invalidate TLB caches if necessary */
-	if (ppc_md.tce_flush)
-		ppc_md.tce_flush(tbl);
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
 
 	DBG("mapped %d elements:\n", outcount);
 
@@ -600,8 +600,8 @@ void ppc_iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 	 * do not do an mb() here, the affected platforms do not need it
 	 * when freeing.
 	 */
-	if (ppc_md.tce_flush)
-		ppc_md.tce_flush(tbl);
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
 }
 
 static void iommu_table_clear(struct iommu_table *tbl)
@@ -613,17 +613,17 @@ static void iommu_table_clear(struct iommu_table *tbl)
 	 */
 	if (!is_kdump_kernel() || is_fadump_active()) {
 		/* Clear the table in case firmware left allocations in it */
-		ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size);
+		tbl->it_ops->clear(tbl, tbl->it_offset, tbl->it_size);
 		return;
 	}
 
 #ifdef CONFIG_CRASH_DUMP
-	if (ppc_md.tce_get) {
+	if (tbl->it_ops->get) {
 		unsigned long index, tceval, tcecount = 0;
 
 		/* Reserve the existing mappings left by the first kernel. */
 		for (index = 0; index < tbl->it_size; index++) {
-			tceval = ppc_md.tce_get(tbl, index + tbl->it_offset);
+			tceval = tbl->it_ops->get(tbl, index + tbl->it_offset);
 			/*
 			 * Freed TCE entry contains 0x7fffffffffffffff on JS20
 			 */
@@ -657,6 +657,8 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	unsigned int i;
 	struct iommu_pool *p;
 
+	BUG_ON(!tbl->it_ops);
+
 	/* number of bytes needed for the bitmap */
 	sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
 
@@ -713,9 +715,11 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 	unsigned long bitmap_sz;
 	unsigned int order;
 
-	if (!tbl || !tbl->it_map) {
-		printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
-				node_name);
+	if (!tbl)
+		return;
+
+	if (!tbl->it_map) {
+		kfree(tbl);
 		return;
 	}
 
@@ -726,13 +730,6 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
 	if (tbl->it_offset == 0)
 		clear_bit(0, tbl->it_map);
 
-#ifdef CONFIG_IOMMU_API
-	if (tbl->it_group) {
-		iommu_group_put(tbl->it_group);
-		BUG_ON(tbl->it_group);
-	}
-#endif
-
 	/* verify that table contains no entries */
 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
 		pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
@@ -871,17 +868,33 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 	}
 }
 
+unsigned long iommu_direction_to_tce_perm(enum dma_data_direction dir)
+{
+	switch (dir) {
+	case DMA_BIDIRECTIONAL:
+		return TCE_PCI_READ | TCE_PCI_WRITE;
+	case DMA_FROM_DEVICE:
+		return TCE_PCI_WRITE;
+	case DMA_TO_DEVICE:
+		return TCE_PCI_READ;
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL_GPL(iommu_direction_to_tce_perm);
+
 #ifdef CONFIG_IOMMU_API
 /*
  * SPAPR TCE API
  */
 static void group_release(void *iommu_data)
 {
-	struct iommu_table *tbl = iommu_data;
-	tbl->it_group = NULL;
+	struct iommu_table_group *table_group = iommu_data;
+
+	table_group->group = NULL;
 }
 
-void iommu_register_group(struct iommu_table *tbl,
+void iommu_register_group(struct iommu_table_group *table_group,
 		int pci_domain_number, unsigned long pe_num)
 {
 	struct iommu_group *grp;
@@ -893,8 +906,8 @@ void iommu_register_group(struct iommu_table *tbl,
 				PTR_ERR(grp));
 		return;
 	}
-	tbl->it_group = grp;
-	iommu_group_set_iommudata(grp, tbl, group_release);
+	table_group->group = grp;
+	iommu_group_set_iommudata(grp, table_group, group_release);
 	name = kasprintf(GFP_KERNEL, "domain%d-pe%lx",
 			pci_domain_number, pe_num);
 	if (!name)
@@ -919,8 +932,8 @@ EXPORT_SYMBOL_GPL(iommu_tce_direction);
 void iommu_flush_tce(struct iommu_table *tbl)
 {
 	/* Flush/invalidate TLB caches if necessary */
-	if (ppc_md.tce_flush)
-		ppc_md.tce_flush(tbl);
+	if (tbl->it_ops->flush)
+		tbl->it_ops->flush(tbl);
 
 	/* Make sure updates are seen by hardware */
 	mb();
@@ -931,7 +944,7 @@ int iommu_tce_clear_param_check(struct iommu_table *tbl,
 		unsigned long ioba, unsigned long tce_value,
 		unsigned long npages)
 {
-	/* ppc_md.tce_free() does not support any value but 0 */
+	/* tbl->it_ops->clear() does not support any value but 0 */
 	if (tce_value)
 		return -EINVAL;
 
@@ -952,10 +965,7 @@ EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check);
 int iommu_tce_put_param_check(struct iommu_table *tbl,
 		unsigned long ioba, unsigned long tce)
 {
-	if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
-		return -EINVAL;
-
-	if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ))
+	if (tce & ~IOMMU_PAGE_MASK(tbl))
 		return -EINVAL;
 
 	if (ioba & ~IOMMU_PAGE_MASK(tbl))
@@ -972,68 +982,16 @@ int iommu_tce_put_param_check(struct iommu_table *tbl,
 }
 EXPORT_SYMBOL_GPL(iommu_tce_put_param_check);
 
-unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
-{
-	unsigned long oldtce;
-	struct iommu_pool *pool = get_pool(tbl, entry);
-
-	spin_lock(&(pool->lock));
-
-	oldtce = ppc_md.tce_get(tbl, entry);
-	if (oldtce & (TCE_PCI_WRITE | TCE_PCI_READ))
-		ppc_md.tce_free(tbl, entry, 1);
-	else
-		oldtce = 0;
-
-	spin_unlock(&(pool->lock));
-
-	return oldtce;
-}
-EXPORT_SYMBOL_GPL(iommu_clear_tce);
-
-int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
-		unsigned long entry, unsigned long pages)
-{
-	unsigned long oldtce;
-	struct page *page;
-
-	for ( ; pages; --pages, ++entry) {
-		oldtce = iommu_clear_tce(tbl, entry);
-		if (!oldtce)
-			continue;
-
-		page = pfn_to_page(oldtce >> PAGE_SHIFT);
-		WARN_ON(!page);
-		if (page) {
-			if (oldtce & TCE_PCI_WRITE)
-				SetPageDirty(page);
-			put_page(page);
-		}
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
-
-/*
- * hwaddr is a kernel virtual address here (0xc... bazillion),
- * tce_build converts it to a physical address.
- */
-int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
-		unsigned long hwaddr, enum dma_data_direction direction)
+long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
+		unsigned long *hpa, enum dma_data_direction *direction)
 {
-	int ret = -EBUSY;
-	unsigned long oldtce;
-	struct iommu_pool *pool = get_pool(tbl, entry);
-
-	spin_lock(&(pool->lock));
+	long ret;
 
-	oldtce = ppc_md.tce_get(tbl, entry);
-	/* Add new entry if it is not busy */
-	if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
-		ret = ppc_md.tce_build(tbl, entry, 1, hwaddr, direction, NULL);
+	ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
 
-	spin_unlock(&(pool->lock));
+	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
+			(*direction == DMA_BIDIRECTIONAL)))
+		SetPageDirty(pfn_to_page(*hpa >> PAGE_SHIFT));
 
 	/* if (unlikely(ret))
 		pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
@@ -1042,84 +1000,72 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(iommu_tce_build);
+EXPORT_SYMBOL_GPL(iommu_tce_xchg);
 
-int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
-		unsigned long tce)
+int iommu_take_ownership(struct iommu_table *tbl)
 {
-	int ret;
-	struct page *page = NULL;
-	unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
-	enum dma_data_direction direction = iommu_tce_direction(tce);
-
-	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
-			direction != DMA_TO_DEVICE, &page);
-	if (unlikely(ret != 1)) {
-		/* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
-				tce, entry << tbl->it_page_shift, ret); */
-		return -EFAULT;
-	}
-	hwaddr = (unsigned long) page_address(page) + offset;
-
-	ret = iommu_tce_build(tbl, entry, hwaddr, direction);
-	if (ret)
-		put_page(page);
-
-	if (ret < 0)
-		pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
-			__func__, entry << tbl->it_page_shift, tce, ret);
+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+	int ret = 0;
 
-	return ret;
-}
-EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
+	/*
+	 * VFIO does not control TCE entries allocation and the guest
+	 * can write new TCEs on top of existing ones so iommu_tce_build()
+	 * must be able to release old pages. This functionality
+	 * requires exchange() callback defined so if it is not
+	 * implemented, we disallow taking ownership over the table.
+	 */
+	if (!tbl->it_ops->exchange)
+		return -EINVAL;
 
-int iommu_take_ownership(struct iommu_table *tbl)
-{
-	unsigned long sz = (tbl->it_size + 7) >> 3;
+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_lock(&tbl->pools[i].lock);
 
 	if (tbl->it_offset == 0)
 		clear_bit(0, tbl->it_map);
 
 	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
 		pr_err("iommu_tce: it_map is not empty");
-		return -EBUSY;
+		ret = -EBUSY;
+		/* Restore bit#0 set by iommu_init_table() */
+		if (tbl->it_offset == 0)
+			set_bit(0, tbl->it_map);
+	} else {
+		memset(tbl->it_map, 0xff, sz);
 	}
 
-	memset(tbl->it_map, 0xff, sz);
-	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_unlock(&tbl->pools[i].lock);
+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 
-	/*
-	 * Disable iommu bypass, otherwise the user can DMA to all of
-	 * our physical memory via the bypass window instead of just
-	 * the pages that has been explicitly mapped into the iommu
-	 */
-	if (tbl->set_bypass)
-		tbl->set_bypass(tbl, false);
-
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_take_ownership);
 
 void iommu_release_ownership(struct iommu_table *tbl)
 {
-	unsigned long sz = (tbl->it_size + 7) >> 3;
+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+
+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_lock(&tbl->pools[i].lock);
 
-	iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
 	memset(tbl->it_map, 0, sz);
 
 	/* Restore bit#0 set by iommu_init_table() */
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
 
-	/* The kernel owns the device now, we can restore the iommu bypass */
-	if (tbl->set_bypass)
-		tbl->set_bypass(tbl, true);
+	for (i = 0; i < tbl->nr_pools; i++)
+		spin_unlock(&tbl->pools[i].lock);
+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
 
 int iommu_add_device(struct device *dev)
 {
 	struct iommu_table *tbl;
+	struct iommu_table_group_link *tgl;
 
 	/*
 	 * The sysfs entries should be populated before
@@ -1137,15 +1083,22 @@ int iommu_add_device(struct device *dev)
 	}
 
 	tbl = get_iommu_table_base(dev);
-	if (!tbl || !tbl->it_group) {
+	if (!tbl) {
 		pr_debug("%s: Skipping device %s with no tbl\n",
 			 __func__, dev_name(dev));
 		return 0;
 	}
 
+	tgl = list_first_entry_or_null(&tbl->it_group_list,
+			struct iommu_table_group_link, next);
+	if (!tgl) {
+		pr_debug("%s: Skipping device %s with no group\n",
+			 __func__, dev_name(dev));
+		return 0;
+	}
 	pr_debug("%s: Adding %s to iommu group %d\n",
 		 __func__, dev_name(dev),
-		 iommu_group_id(tbl->it_group));
+		 iommu_group_id(tgl->table_group->group));
 
 	if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
 		pr_err("%s: Invalid IOMMU page size %lx (%lx) on %s\n",
@@ -1154,7 +1107,7 @@ int iommu_add_device(struct device *dev)
 		return -EINVAL;
 	}
 
-	return iommu_group_add_device(tbl->it_group, dev);
+	return iommu_group_add_device(tgl->table_group->group, dev);
 }
 EXPORT_SYMBOL_GPL(iommu_add_device);
 
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 15c99b649b04..b2eb4686bd8f 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -73,7 +73,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
 		    uint64_t nip, uint64_t addr)
 {
 	uint64_t srr1;
-	int index = __this_cpu_inc_return(mce_nest_count);
+	int index = __this_cpu_inc_return(mce_nest_count) - 1;
 	struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
 
 	/*
@@ -184,7 +184,7 @@ void machine_check_queue_event(void)
 	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
 		return;
 
-	index = __this_cpu_inc_return(mce_queue_count);
+	index = __this_cpu_inc_return(mce_queue_count) - 1;
 	/* If queue is full, just return for now. */
 	if (index >= MAX_MC_EVT) {
 		__this_cpu_dec(mce_queue_count);
diff --git a/arch/powerpc/kernel/msi.c b/arch/powerpc/kernel/msi.c
index 71bd161640cf..dab616a33b8d 100644
--- a/arch/powerpc/kernel/msi.c
+++ b/arch/powerpc/kernel/msi.c
@@ -15,7 +15,10 @@
 
 int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-	if (!ppc_md.setup_msi_irqs || !ppc_md.teardown_msi_irqs) {
+	struct pci_controller *phb = pci_bus_to_host(dev->bus);
+
+	if (!phb->controller_ops.setup_msi_irqs ||
+	    !phb->controller_ops.teardown_msi_irqs) {
 		pr_debug("msi: Platform doesn't provide MSI callbacks.\n");
 		return -ENOSYS;
 	}
@@ -24,10 +27,12 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	if (type == PCI_CAP_ID_MSI && nvec > 1)
 		return 1;
 
-	return ppc_md.setup_msi_irqs(dev, nvec, type);
+	return phb->controller_ops.setup_msi_irqs(dev, nvec, type);
 }
 
 void arch_teardown_msi_irqs(struct pci_dev *dev)
 {
-	ppc_md.teardown_msi_irqs(dev);
+	struct pci_controller *phb = pci_bus_to_host(dev->bus);
+
+	phb->controller_ops.teardown_msi_irqs(dev);
 }
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 0d054068a21d..b9de34d44fcb 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -89,6 +89,7 @@ struct pci_controller *pcibios_alloc_controller(struct device_node *dev)
 #endif
 	return phb;
 }
+EXPORT_SYMBOL_GPL(pcibios_alloc_controller);
 
 void pcibios_free_controller(struct pci_controller *phb)
 {
@@ -1447,6 +1448,7 @@ void pcibios_claim_one_bus(struct pci_bus *bus)
 	list_for_each_entry(child_bus, &bus->children, node)
 		pcibios_claim_one_bus(child_bus);
 }
+EXPORT_SYMBOL_GPL(pcibios_claim_one_bus);
 
 
 /* pcibios_finish_adding_to_bus
@@ -1488,6 +1490,14 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
 	return pci_enable_resources(dev, mask);
 }
 
+void pcibios_disable_device(struct pci_dev *dev)
+{
+	struct pci_controller *phb = pci_bus_to_host(dev->bus);
+
+	if (phb->controller_ops.disable_device)
+		phb->controller_ops.disable_device(dev);
+}
+
 resource_size_t pcibios_io_space_offset(struct pci_controller *hose)
 {
 	return (unsigned long) hose->io_base_virt - _IO_BASE;
@@ -1680,6 +1690,7 @@ void pcibios_scan_phb(struct pci_controller *hose)
 			pcie_bus_configure_settings(child);
 	}
 }
+EXPORT_SYMBOL_GPL(pcibios_scan_phb);
 
 static void fixup_hide_host_resource_fsl(struct pci_dev *dev)
 {
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
index 7ed85a69a9c2..7f9ed0c1f6b9 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -29,7 +29,12 @@
  */
 void pcibios_release_device(struct pci_dev *dev)
 {
+	struct pci_controller *phb = pci_bus_to_host(dev->bus);
+
 	eeh_remove_device(dev);
+
+	if (phb->controller_ops.release_device)
+		phb->controller_ops.release_device(dev);
 }
 
 /**
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index febb50dd5328..8005e18d1b40 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1112,7 +1112,6 @@ static void setup_ksp_vsid(struct task_struct *p, unsigned long sp)
 /*
  * Copy a thread..
  */
-extern unsigned long dscr_default; /* defined in arch/powerpc/kernel/sysfs.c */
 
 /*
  * Copy architecture-specific thread state
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 308c5e15676b..8b888b12a475 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -46,7 +46,6 @@
 #include <asm/mmu.h>
 #include <asm/paca.h>
 #include <asm/pgtable.h>
-#include <asm/pci.h>
 #include <asm/iommu.h>
 #include <asm/btext.h>
 #include <asm/sections.h>
@@ -573,6 +572,7 @@ static void __init early_reserve_mem_dt(void)
 	int len;
 	const __be32 *prop;
 
+	early_init_fdt_reserve_self();
 	early_init_fdt_scan_reserved_mem();
 
 	dt_root = of_get_flat_dt_root();
@@ -800,6 +800,7 @@ int of_get_ibm_chip_id(struct device_node *np)
 	}
 	return -1;
 }
+EXPORT_SYMBOL(of_get_ibm_chip_id);
 
 /**
  * cpu_to_chip_id - Return the cpus chip-id
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index fd1fe4c37599..fcca8077e6a2 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -37,7 +37,6 @@
 #include <asm/smp.h>
 #include <asm/mmu.h>
 #include <asm/pgtable.h>
-#include <asm/pci.h>
 #include <asm/iommu.h>
 #include <asm/btext.h>
 #include <asm/sections.h>
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index c69671c03c3b..bdcbb716f4d6 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -523,7 +523,8 @@ void __init setup_system(void)
 	smp_release_cpus();
 #endif
 
-	pr_info("Starting Linux PPC64 %s\n", init_utsname()->version);
+	pr_info("Starting Linux %s %s\n", init_utsname()->machine,
+		 init_utsname()->version);
 
 	pr_info("-----------------------------------------------------\n");
 	pr_info("ppc64_pft_size    = 0x%llx\n", ppc64_pft_size);
@@ -686,6 +687,9 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_PPC_64K_PAGES
 	init_mm.context.pte_frag = NULL;
 #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	mm_iommu_init(&init_mm.context);
+#endif
 	irqstack_early_init();
 	exc_lvl_early_init();
 	emergency_stack_init();
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index fa1fd8a0c867..692873bff334 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -496,13 +496,34 @@ static DEVICE_ATTR(spurr, 0400, show_spurr, NULL);
 static DEVICE_ATTR(purr, 0400, show_purr, store_purr);
 static DEVICE_ATTR(pir, 0400, show_pir, NULL);
 
+/*
+ * This is the system wide DSCR register default value. Any
+ * change to this default value through the sysfs interface
+ * will update all per cpu DSCR default values across the
+ * system stored in their respective PACA structures.
+ */
 static unsigned long dscr_default;
 
+/**
+ * read_dscr() - Fetch the cpu specific DSCR default
+ * @val:	Returned cpu specific DSCR default value
+ *
+ * This function returns the per cpu DSCR default value
+ * for any cpu which is contained in it's PACA structure.
+ */
 static void read_dscr(void *val)
 {
 	*(unsigned long *)val = get_paca()->dscr_default;
 }
 
+
+/**
+ * write_dscr() - Update the cpu specific DSCR default
+ * @val:	New cpu specific DSCR default value to update
+ *
+ * This function updates the per cpu DSCR default value
+ * for any cpu which is contained in it's PACA structure.
+ */
 static void write_dscr(void *val)
 {
 	get_paca()->dscr_default = *(unsigned long *)val;
@@ -520,12 +541,29 @@ static void add_write_permission_dev_attr(struct device_attribute *attr)
 	attr->attr.mode |= 0200;
 }
 
+/**
+ * show_dscr_default() - Fetch the system wide DSCR default
+ * @dev:	Device structure
+ * @attr:	Device attribute structure
+ * @buf:	Interface buffer
+ *
+ * This function returns the system wide DSCR default value.
+ */
 static ssize_t show_dscr_default(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
 	return sprintf(buf, "%lx\n", dscr_default);
 }
 
+/**
+ * store_dscr_default() - Update the system wide DSCR default
+ * @dev:	Device structure
+ * @attr:	Device attribute structure
+ * @buf:	Interface buffer
+ * @count:	Size of the update
+ *
+ * This function updates the system wide DSCR default value.
+ */
 static ssize_t __used store_dscr_default(struct device *dev,
 		struct device_attribute *attr, const char *buf,
 		size_t count)
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 5754b226da7e..bf8f34a58670 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -293,7 +293,7 @@ dont_backup_fp:
 	ld	r2, STK_GOT(r1)
 
 	/* Load CPU's default DSCR */
-	ld	r0, PACA_DSCR(r13)
+	ld	r0, PACA_DSCR_DEFAULT(r13)
 	mtspr	SPRN_DSCR, r0
 
 	blr
@@ -473,7 +473,7 @@ restore_gprs:
 	ld	r2, STK_GOT(r1)
 
 	/* Load CPU's default DSCR */
-	ld	r0, PACA_DSCR(r13)
+	ld	r0, PACA_DSCR_DEFAULT(r13)
 	mtspr	SPRN_DSCR, r0
 
 	blr
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 19e4744b6eba..6530f1b8874d 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1377,6 +1377,7 @@ void facility_unavailable_exception(struct pt_regs *regs)
 	};
 	char *facility = "unknown";
 	u64 value;
+	u32 instword, rd;
 	u8 status;
 	bool hv;
 
@@ -1388,12 +1389,46 @@ void facility_unavailable_exception(struct pt_regs *regs)
 
 	status = value >> 56;
 	if (status == FSCR_DSCR_LG) {
-		/* User is acessing the DSCR.  Set the inherit bit and allow
-		 * the user to set it directly in future by setting via the
-		 * FSCR DSCR bit.  We always leave HFSCR DSCR set.
+		/*
+		 * User is accessing the DSCR register using the problem
+		 * state only SPR number (0x03) either through a mfspr or
+		 * a mtspr instruction. If it is a write attempt through
+		 * a mtspr, then we set the inherit bit. This also allows
+		 * the user to write or read the register directly in the
+		 * future by setting via the FSCR DSCR bit. But in case it
+		 * is a read DSCR attempt through a mfspr instruction, we
+		 * just emulate the instruction instead. This code path will
+		 * always emulate all the mfspr instructions till the user
+		 * has attempted atleast one mtspr instruction. This way it
+		 * preserves the same behaviour when the user is accessing
+		 * the DSCR through privilege level only SPR number (0x11)
+		 * which is emulated through illegal instruction exception.
+		 * We always leave HFSCR DSCR set.
 		 */
-		current->thread.dscr_inherit = 1;
-		mtspr(SPRN_FSCR, value | FSCR_DSCR);
+		if (get_user(instword, (u32 __user *)(regs->nip))) {
+			pr_err("Failed to fetch the user instruction\n");
+			return;
+		}
+
+		/* Write into DSCR (mtspr 0x03, RS) */
+		if ((instword & PPC_INST_MTSPR_DSCR_USER_MASK)
+				== PPC_INST_MTSPR_DSCR_USER) {
+			rd = (instword >> 21) & 0x1f;
+			current->thread.dscr = regs->gpr[rd];
+			current->thread.dscr_inherit = 1;
+			mtspr(SPRN_FSCR, value | FSCR_DSCR);
+		}
+
+		/* Read from DSCR (mfspr RT, 0x03) */
+		if ((instword & PPC_INST_MFSPR_DSCR_USER_MASK)
+				== PPC_INST_MFSPR_DSCR_USER) {
+			if (emulate_instruction(regs)) {
+				pr_err("DSCR based mfspr emulation failed\n");
+				return;
+			}
+			regs->nip += 4;
+			emulate_single_step(regs);
+		}
 		return;
 	}
 
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 305eb0d9b768..b457bfa28436 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -49,13 +49,16 @@
 /* The alignment of the vDSO */
 #define VDSO_ALIGNMENT	(1 << 16)
 
-extern char vdso32_start, vdso32_end;
-static void *vdso32_kbase = &vdso32_start;
 static unsigned int vdso32_pages;
+static void *vdso32_kbase;
 static struct page **vdso32_pagelist;
 unsigned long vdso32_sigtramp;
 unsigned long vdso32_rt_sigtramp;
 
+#ifdef CONFIG_VDSO32
+extern char vdso32_start, vdso32_end;
+#endif
+
 #ifdef CONFIG_PPC64
 extern char vdso64_start, vdso64_end;
 static void *vdso64_kbase = &vdso64_start;
@@ -140,50 +143,6 @@ struct lib64_elfinfo
 };
 
 
-#ifdef __DEBUG
-static void dump_one_vdso_page(struct page *pg, struct page *upg)
-{
-	printk("kpg: %p (c:%d,f:%08lx)", __va(page_to_pfn(pg) << PAGE_SHIFT),
-	       page_count(pg),
-	       pg->flags);
-	if (upg && !IS_ERR(upg) /* && pg != upg*/) {
-		printk(" upg: %p (c:%d,f:%08lx)", __va(page_to_pfn(upg)
-						       << PAGE_SHIFT),
-		       page_count(upg),
-		       upg->flags);
-	}
-	printk("\n");
-}
-
-static void dump_vdso_pages(struct vm_area_struct * vma)
-{
-	int i;
-
-	if (!vma || is_32bit_task()) {
-		printk("vDSO32 @ %016lx:\n", (unsigned long)vdso32_kbase);
-		for (i=0; i<vdso32_pages; i++) {
-			struct page *pg = virt_to_page(vdso32_kbase +
-						       i*PAGE_SIZE);
-			struct page *upg = (vma && vma->vm_mm) ?
-				follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0)
-				: NULL;
-			dump_one_vdso_page(pg, upg);
-		}
-	}
-	if (!vma || !is_32bit_task()) {
-		printk("vDSO64 @ %016lx:\n", (unsigned long)vdso64_kbase);
-		for (i=0; i<vdso64_pages; i++) {
-			struct page *pg = virt_to_page(vdso64_kbase +
-						       i*PAGE_SIZE);
-			struct page *upg = (vma && vma->vm_mm) ?
-				follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0)
-				: NULL;
-			dump_one_vdso_page(pg, upg);
-		}
-	}
-}
-#endif /* DEBUG */
-
 /*
  * This is called from binfmt_elf, we create the special vma for the
  * vDSO and insert it into the mm struct tree
@@ -292,6 +251,7 @@ const char *arch_vma_name(struct vm_area_struct *vma)
 
 
 
+#ifdef CONFIG_VDSO32
 static void * __init find_section32(Elf32_Ehdr *ehdr, const char *secname,
 				  unsigned long *size)
 {
@@ -379,6 +339,20 @@ static int __init vdso_do_func_patch32(struct lib32_elfinfo *v32,
 
 	return 0;
 }
+#else /* !CONFIG_VDSO32 */
+static unsigned long __init find_function32(struct lib32_elfinfo *lib,
+					    const char *symname)
+{
+	return 0;
+}
+
+static int __init vdso_do_func_patch32(struct lib32_elfinfo *v32,
+				       struct lib64_elfinfo *v64,
+				       const char *orig, const char *fix)
+{
+	return 0;
+}
+#endif /* CONFIG_VDSO32 */
 
 
 #ifdef CONFIG_PPC64
@@ -489,6 +463,7 @@ static __init int vdso_do_find_sections(struct lib32_elfinfo *v32,
 	 * Locate symbol tables & text section
 	 */
 
+#ifdef CONFIG_VDSO32
 	v32->dynsym = find_section32(v32->hdr, ".dynsym", &v32->dynsymsize);
 	v32->dynstr = find_section32(v32->hdr, ".dynstr", NULL);
 	if (v32->dynsym == NULL || v32->dynstr == NULL) {
@@ -501,6 +476,7 @@ static __init int vdso_do_find_sections(struct lib32_elfinfo *v32,
 		return -1;
 	}
 	v32->text = sect - vdso32_kbase;
+#endif
 
 #ifdef CONFIG_PPC64
 	v64->dynsym = find_section64(v64->hdr, ".dynsym", &v64->dynsymsize);
@@ -537,7 +513,9 @@ static __init void vdso_setup_trampolines(struct lib32_elfinfo *v32,
 static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
 				       struct lib64_elfinfo *v64)
 {
+#ifdef CONFIG_VDSO32
 	Elf32_Sym *sym32;
+#endif
 #ifdef CONFIG_PPC64
 	Elf64_Sym *sym64;
 
@@ -552,6 +530,7 @@ static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
 		(sym64->st_value - VDSO64_LBASE);
 #endif /* CONFIG_PPC64 */
 
+#ifdef CONFIG_VDSO32
 	sym32 = find_symbol32(v32, "__kernel_datapage_offset");
 	if (sym32 == NULL) {
 		printk(KERN_ERR "vDSO32: Can't find symbol "
@@ -561,6 +540,7 @@ static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
 	*((int *)(vdso32_kbase + (sym32->st_value - VDSO32_LBASE))) =
 		(vdso32_pages << PAGE_SHIFT) -
 		(sym32->st_value - VDSO32_LBASE);
+#endif
 
 	return 0;
 }
@@ -569,55 +549,54 @@ static __init int vdso_fixup_datapage(struct lib32_elfinfo *v32,
 static __init int vdso_fixup_features(struct lib32_elfinfo *v32,
 				      struct lib64_elfinfo *v64)
 {
-	void *start32;
-	unsigned long size32;
+	unsigned long size;
+	void *start;
 
 #ifdef CONFIG_PPC64
-	void *start64;
-	unsigned long size64;
-
-	start64 = find_section64(v64->hdr, "__ftr_fixup", &size64);
-	if (start64)
+	start = find_section64(v64->hdr, "__ftr_fixup", &size);
+	if (start)
 		do_feature_fixups(cur_cpu_spec->cpu_features,
-				  start64, start64 + size64);
+				  start, start + size);
 
-	start64 = find_section64(v64->hdr, "__mmu_ftr_fixup", &size64);
-	if (start64)
+	start = find_section64(v64->hdr, "__mmu_ftr_fixup", &size);
+	if (start)
 		do_feature_fixups(cur_cpu_spec->mmu_features,
-				  start64, start64 + size64);
+				  start, start + size);
 
-	start64 = find_section64(v64->hdr, "__fw_ftr_fixup", &size64);
-	if (start64)
+	start = find_section64(v64->hdr, "__fw_ftr_fixup", &size);
+	if (start)
 		do_feature_fixups(powerpc_firmware_features,
-				  start64, start64 + size64);
+				  start, start + size);
 
-	start64 = find_section64(v64->hdr, "__lwsync_fixup", &size64);
-	if (start64)
+	start = find_section64(v64->hdr, "__lwsync_fixup", &size);
+	if (start)
 		do_lwsync_fixups(cur_cpu_spec->cpu_features,
-				 start64, start64 + size64);
+				 start, start + size);
 #endif /* CONFIG_PPC64 */
 
-	start32 = find_section32(v32->hdr, "__ftr_fixup", &size32);
-	if (start32)
+#ifdef CONFIG_VDSO32
+	start = find_section32(v32->hdr, "__ftr_fixup", &size);
+	if (start)
 		do_feature_fixups(cur_cpu_spec->cpu_features,
-				  start32, start32 + size32);
+				  start, start + size);
 
-	start32 = find_section32(v32->hdr, "__mmu_ftr_fixup", &size32);
-	if (start32)
+	start = find_section32(v32->hdr, "__mmu_ftr_fixup", &size);
+	if (start)
 		do_feature_fixups(cur_cpu_spec->mmu_features,
-				  start32, start32 + size32);
+				  start, start + size);
 
 #ifdef CONFIG_PPC64
-	start32 = find_section32(v32->hdr, "__fw_ftr_fixup", &size32);
-	if (start32)
+	start = find_section32(v32->hdr, "__fw_ftr_fixup", &size);
+	if (start)
 		do_feature_fixups(powerpc_firmware_features,
-				  start32, start32 + size32);
+				  start, start + size);
 #endif /* CONFIG_PPC64 */
 
-	start32 = find_section32(v32->hdr, "__lwsync_fixup", &size32);
-	if (start32)
+	start = find_section32(v32->hdr, "__lwsync_fixup", &size);
+	if (start)
 		do_lwsync_fixups(cur_cpu_spec->cpu_features,
-				 start32, start32 + size32);
+				 start, start + size);
+#endif
 
 	return 0;
 }
@@ -779,11 +758,15 @@ static int __init vdso_init(void)
 #endif /* CONFIG_PPC64 */
 
 
+#ifdef CONFIG_VDSO32
+	vdso32_kbase = &vdso32_start;
+
 	/*
 	 * Calculate the size of the 32 bits vDSO
 	 */
 	vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
 	DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages);
+#endif
 
 
 	/*
@@ -804,6 +787,7 @@ static int __init vdso_init(void)
 		return 0;
 	}
 
+#ifdef CONFIG_VDSO32
 	/* Make sure pages are in the correct state */
 	vdso32_pagelist = kzalloc(sizeof(struct page *) * (vdso32_pages + 2),
 				  GFP_KERNEL);
@@ -816,6 +800,7 @@ static int __init vdso_init(void)
 	}
 	vdso32_pagelist[i++] = virt_to_page(vdso_data);
 	vdso32_pagelist[i] = NULL;
+#endif
 
 #ifdef CONFIG_PPC64
 	vdso64_pagelist = kzalloc(sizeof(struct page *) * (vdso64_pages + 2),
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 5bfdab9047be..b41426c60ef6 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -1196,6 +1196,11 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
 	tbl->it_type = TCE_VB;
 	tbl->it_blocksize = 16;
 
+	if (firmware_has_feature(FW_FEATURE_LPAR))
+		tbl->it_ops = &iommu_table_lpar_multi_ops;
+	else
+		tbl->it_ops = &iommu_table_pseries_ops;
+
 	return iommu_init_table(tbl, -1);
 }
 
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index f096e72262f4..1db685104ffc 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -213,6 +213,7 @@ SECTIONS
 		*(.opd)
 	}
 
+	. = ALIGN(256);
 	.got : AT(ADDR(.got) - LOAD_OFFSET) {
 		__toc_start = .;
 #ifndef CONFIG_RELOCATABLE
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 453a8a47a467..05ea8fc7f829 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -757,16 +757,17 @@ void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
-				struct kvm_userspace_memory_region *mem)
+				const struct kvm_userspace_memory_region *mem)
 {
 	return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem);
 }
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem,
-				const struct kvm_memory_slot *old)
+				const struct kvm_userspace_memory_region *mem,
+				const struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new)
 {
-	kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old);
+	kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new);
 }
 
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 1a4acf8bf4f4..dab68b7af3f2 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -650,7 +650,7 @@ static void kvmppc_rmap_reset(struct kvm *kvm)
 	int srcu_idx;
 
 	srcu_idx = srcu_read_lock(&kvm->srcu);
-	slots = kvm->memslots;
+	slots = kvm_memslots(kvm);
 	kvm_for_each_memslot(memslot, slots) {
 		/*
 		 * This assumes it is acceptable to lose reference and
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 48d3c5d2ecc9..68d067ad4222 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1952,7 +1952,7 @@ static void post_guest_process(struct kvmppc_vcore *vc)
  */
 static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-	struct kvm_vcpu *vcpu;
+	struct kvm_vcpu *vcpu, *vnext;
 	int i;
 	int srcu_idx;
 
@@ -1982,7 +1982,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	 */
 	if ((threads_per_core > 1) &&
 	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
-		list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
+		list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+					 arch.run_list) {
 			vcpu->arch.ret = -EBUSY;
 			kvmppc_remove_runnable(vc, vcpu);
 			wake_up(&vcpu->arch.cpu_run);
@@ -2320,6 +2321,7 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
 static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
 					 struct kvm_dirty_log *log)
 {
+	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
 	int r;
 	unsigned long n;
@@ -2330,7 +2332,8 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
 	if (log->slot >= KVM_USER_MEM_SLOTS)
 		goto out;
 
-	memslot = id_to_memslot(kvm->memslots, log->slot);
+	slots = kvm_memslots(kvm);
+	memslot = id_to_memslot(slots, log->slot);
 	r = -ENOENT;
 	if (!memslot->dirty_bitmap)
 		goto out;
@@ -2373,16 +2376,18 @@ static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
 
 static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
 					struct kvm_memory_slot *memslot,
-					struct kvm_userspace_memory_region *mem)
+					const struct kvm_userspace_memory_region *mem)
 {
 	return 0;
 }
 
 static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem,
-				const struct kvm_memory_slot *old)
+				const struct kvm_userspace_memory_region *mem,
+				const struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new)
 {
 	unsigned long npages = mem->memory_size >> PAGE_SHIFT;
+	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
 
 	if (npages && old->npages) {
@@ -2392,7 +2397,8 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 		 * since the rmap array starts out as all zeroes,
 		 * i.e. no pages are dirty.
 		 */
-		memslot = id_to_memslot(kvm->memslots, mem->slot);
+		slots = kvm_memslots(kvm);
+		memslot = id_to_memslot(slots, mem->slot);
 		kvmppc_hv_get_dirty_log(kvm, memslot, NULL);
 	}
 }
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 4d70df26c402..faa86e9c0551 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -324,7 +324,7 @@ kvm_start_guest:
 kvm_secondary_got_guest:
 
 	/* Set HSTATE_DSCR(r13) to something sensible */
-	ld	r6, PACA_DSCR(r13)
+	ld	r6, PACA_DSCR_DEFAULT(r13)
 	std	r6, HSTATE_DSCR(r13)
 
 	/* Order load of vcore, ptid etc. after load of vcpu */
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index f57383941d03..64891b081ad5 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1530,6 +1530,7 @@ out:
 static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm,
 					 struct kvm_dirty_log *log)
 {
+	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
 	struct kvm_vcpu *vcpu;
 	ulong ga, ga_end;
@@ -1545,7 +1546,8 @@ static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm,
 
 	/* If nothing is dirty, don't bother messing with page tables. */
 	if (is_dirty) {
-		memslot = id_to_memslot(kvm->memslots, log->slot);
+		slots = kvm_memslots(kvm);
+		memslot = id_to_memslot(slots, log->slot);
 
 		ga = memslot->base_gfn << PAGE_SHIFT;
 		ga_end = ga + (memslot->npages << PAGE_SHIFT);
@@ -1571,14 +1573,15 @@ static void kvmppc_core_flush_memslot_pr(struct kvm *kvm,
 
 static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm,
 					struct kvm_memory_slot *memslot,
-					struct kvm_userspace_memory_region *mem)
+					const struct kvm_userspace_memory_region *mem)
 {
 	return 0;
 }
 
 static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem,
-				const struct kvm_memory_slot *old)
+				const struct kvm_userspace_memory_region *mem,
+				const struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new)
 {
 	return;
 }
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 6c1316a15a27..cc5842657161 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1004,10 +1004,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 	}
 
-	local_irq_enable();
-
 	trace_kvm_exit(exit_nr, vcpu);
-	kvm_guest_exit();
+	__kvm_guest_exit();
+
+	local_irq_enable();
 
 	run->exit_reason = KVM_EXIT_UNKNOWN;
 	run->ready_for_interrupt_injection = 1;
@@ -1784,14 +1784,15 @@ int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				      struct kvm_memory_slot *memslot,
-				      struct kvm_userspace_memory_region *mem)
+				      const struct kvm_userspace_memory_region *mem)
 {
 	return 0;
 }
 
 void kvmppc_core_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem,
-				const struct kvm_memory_slot *old)
+				const struct kvm_userspace_memory_region *mem,
+				const struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new)
 {
 }
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index ac3ddf115f3d..e5dde32fe71f 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -115,7 +115,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
 			continue;
 		}
 
-		kvm_guest_enter();
+		__kvm_guest_enter();
 		return 1;
 	}
 
@@ -595,18 +595,19 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
-				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_userspace_memory_region *mem,
 				   enum kvm_mr_change change)
 {
 	return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_userspace_memory_region *mem,
 				   const struct kvm_memory_slot *old,
+				   const struct kvm_memory_slot *new,
 				   enum kvm_mr_change change)
 {
-	kvmppc_core_commit_memory_region(kvm, mem, old);
+	kvmppc_core_commit_memory_region(kvm, mem, old, new);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 7902802a19a5..a47e14277fd8 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -33,6 +33,6 @@ obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o
 obj-$(CONFIG_FTR_FIXUP_SELFTEST) += feature-fixups-test.o
 
 obj-$(CONFIG_ALTIVEC)	+= xor_vmx.o
-CFLAGS_xor_vmx.o += -maltivec -mabi=altivec
+CFLAGS_xor_vmx.o += -maltivec $(call cc-option,-mabi=altivec)
 
 obj-$(CONFIG_PPC64) += $(obj64-y)
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index 3cf529ceec5b..ac93a3bd2730 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -27,11 +27,11 @@ int enter_vmx_usercopy(void)
 	if (in_interrupt())
 		return 0;
 
-	/* This acts as preempt_disable() as well and will make
-	 * enable_kernel_altivec(). We need to disable page faults
-	 * as they can call schedule and thus make us lose the VMX
-	 * context. So on page faults, we just fail which will cause
-	 * a fallback to the normal non-vmx copy.
+	preempt_disable();
+	/*
+	 * We need to disable page faults as they can call schedule and
+	 * thus make us lose the VMX context. So on page faults, we just
+	 * fail which will cause a fallback to the normal non-vmx copy.
 	 */
 	pagefault_disable();
 
@@ -47,6 +47,7 @@ int enter_vmx_usercopy(void)
 int exit_vmx_usercopy(void)
 {
 	pagefault_enable();
+	preempt_enable();
 	return 0;
 }
 
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 9c8770b5f96f..3eb73a38220d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)		+= highmem.o
 obj-$(CONFIG_PPC_COPRO_BASE)	+= copro_fault.o
+obj-$(CONFIG_SPAPR_TCE_IOMMU)	+= mmu_context_iommu.o
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index f031a47d7701..6527882ce05e 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -26,7 +26,7 @@
 #include <asm/reg.h>
 #include <asm/copro.h>
 #include <asm/spu.h>
-#include <misc/cxl.h>
+#include <misc/cxl-base.h>
 
 /*
  * This ought to be kept in sync with the powerpc specific do_page_fault
@@ -100,7 +100,7 @@ EXPORT_SYMBOL_GPL(copro_handle_mm_fault);
 
 int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
 {
-	u64 vsid;
+	u64 vsid, vsidkey;
 	int psize, ssize;
 
 	switch (REGION_ID(ea)) {
@@ -109,6 +109,7 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
 		psize = get_slice_psize(mm, ea);
 		ssize = user_segment_size(ea);
 		vsid = get_vsid(mm->context.id, ea, ssize);
+		vsidkey = SLB_VSID_USER;
 		break;
 	case VMALLOC_REGION_ID:
 		pr_devel("%s: 0x%llx -- VMALLOC_REGION_ID\n", __func__, ea);
@@ -118,19 +119,21 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
 			psize = mmu_io_psize;
 		ssize = mmu_kernel_ssize;
 		vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+		vsidkey = SLB_VSID_KERNEL;
 		break;
 	case KERNEL_REGION_ID:
 		pr_devel("%s: 0x%llx -- KERNEL_REGION_ID\n", __func__, ea);
 		psize = mmu_linear_psize;
 		ssize = mmu_kernel_ssize;
 		vsid = get_kernel_vsid(ea, mmu_kernel_ssize);
+		vsidkey = SLB_VSID_KERNEL;
 		break;
 	default:
 		pr_debug("%s: invalid region access at %016llx\n", __func__, ea);
 		return 1;
 	}
 
-	vsid = (vsid << slb_vsid_shift(ssize)) | SLB_VSID_USER;
+	vsid = (vsid << slb_vsid_shift(ssize)) | vsidkey;
 
 	vsid |= mmu_psize_defs[psize].sllp |
 		((ssize == MMU_SEGSIZE_1T) ? SLB_VSID_B_1T : 0);
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index b396868d2aa7..6d535973b200 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -33,13 +33,13 @@
 #include <linux/ratelimit.h>
 #include <linux/context_tracking.h>
 #include <linux/hugetlb.h>
+#include <linux/uaccess.h>
 
 #include <asm/firmware.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
 #include <asm/mmu_context.h>
-#include <asm/uaccess.h>
 #include <asm/tlbflush.h>
 #include <asm/siginfo.h>
 #include <asm/debug.h>
@@ -272,15 +272,16 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	if (!arch_irq_disabled_regs(regs))
 		local_irq_enable();
 
-	if (in_atomic() || mm == NULL) {
+	if (faulthandler_disabled() || mm == NULL) {
 		if (!user_mode(regs)) {
 			rc = SIGSEGV;
 			goto bail;
 		}
-		/* in_atomic() in user mode is really bad,
+		/* faulthandler_disabled() in user mode is really bad,
 		   as is current->mm == NULL. */
 		printk(KERN_EMERG "Page fault in user mode with "
-		       "in_atomic() = %d mm = %p\n", in_atomic(), mm);
+		       "faulthandler_disabled() = %d mm = %p\n",
+		       faulthandler_disabled(), mm);
 		printk(KERN_EMERG "NIP = %lx  MSR = %lx\n",
 		       regs->nip, regs->msr);
 		die("Weird page fault", regs, SIGSEGV);
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 9c4880ddecd6..13befa35d8a8 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -29,7 +29,7 @@
 #include <asm/kexec.h>
 #include <asm/ppc-opcode.h>
 
-#include <misc/cxl.h>
+#include <misc/cxl-base.h>
 
 #ifdef DEBUG_LOW
 #define DBG_LOW(fmt...) udbg_printf(fmt)
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index fda236f908eb..5ec987f65b2c 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -57,6 +57,7 @@
 #include <asm/fadump.h>
 #include <asm/firmware.h>
 #include <asm/tm.h>
+#include <asm/trace.h>
 
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
@@ -1004,6 +1005,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 
 	DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
 		ea, access, trap);
+	trace_hash_fault(ea, access, trap);
 
 	/* Get region & vsid */
  	switch (REGION_ID(ea)) {
@@ -1475,7 +1477,7 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
 	unsigned long hash;
 	unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
 	unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
-	unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL);
+	unsigned long mode = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL));
 	long ret;
 
 	hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c
index e7450bdbe83a..e292c8a60952 100644
--- a/arch/powerpc/mm/highmem.c
+++ b/arch/powerpc/mm/highmem.c
@@ -34,7 +34,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 	unsigned long vaddr;
 	int idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -59,6 +59,7 @@ void __kunmap_atomic(void *kvaddr)
 
 	if (vaddr < __fix_to_virt(FIX_KMAP_END)) {
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -82,5 +83,6 @@ void __kunmap_atomic(void *kvaddr)
 
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0ce968b00b7c..3385e3d0506e 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -689,27 +689,34 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 struct page *
 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
-	pte_t *ptep;
-	struct page *page;
+	pte_t *ptep, pte;
 	unsigned shift;
 	unsigned long mask, flags;
+	struct page *page = ERR_PTR(-EINVAL);
+
+	local_irq_save(flags);
+	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
+	if (!ptep)
+		goto no_page;
+	pte = READ_ONCE(*ptep);
 	/*
+	 * Verify it is a huge page else bail.
 	 * Transparent hugepages are handled by generic code. We can skip them
 	 * here.
 	 */
-	local_irq_save(flags);
-	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
+	if (!shift || pmd_trans_huge(__pmd(pte_val(pte))))
+		goto no_page;
 
-	/* Verify it is a huge page else bail. */
-	if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) {
-		local_irq_restore(flags);
-		return ERR_PTR(-EINVAL);
+	if (!pte_present(pte)) {
+		page = NULL;
+		goto no_page;
 	}
 	mask = (1UL << shift) - 1;
-	page = pte_page(*ptep);
+	page = pte_page(pte);
 	if (page)
 		page += (address & mask) / PAGE_SIZE;
 
+no_page:
 	local_irq_restore(flags);
 	return page;
 }
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 45fda71feb27..0f11819d8f1d 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -560,7 +560,7 @@ subsys_initcall(add_system_ram_resources);
  */
 int devmem_is_allowed(unsigned long pfn)
 {
-	if (iomem_is_exclusive(pfn << PAGE_SHIFT))
+	if (iomem_is_exclusive(PFN_PHYS(pfn)))
 		return 0;
 	if (!page_is_ram(pfn))
 		return 1;
diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
index 178876aef40f..4e4efbc2658e 100644
--- a/arch/powerpc/mm/mmu_context_hash64.c
+++ b/arch/powerpc/mm/mmu_context_hash64.c
@@ -89,6 +89,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 #ifdef CONFIG_PPC_64K_PAGES
 	mm->context.pte_frag = NULL;
 #endif
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	mm_iommu_init(&mm->context);
+#endif
 	return 0;
 }
 
@@ -132,6 +135,9 @@ static inline void destroy_pagetable_page(struct mm_struct *mm)
 
 void destroy_context(struct mm_struct *mm)
 {
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+	mm_iommu_cleanup(&mm->context);
+#endif
 
 #ifdef CONFIG_PPC_ICSWX
 	drop_cop(mm->context.acop, mm);
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
new file mode 100644
index 000000000000..da6a2168ae9e
--- /dev/null
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -0,0 +1,316 @@
+/*
+ *  IOMMU helpers in MMU context.
+ *
+ *  Copyright (C) 2015 IBM Corp. <aik@ozlabs.ru>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/rculist.h>
+#include <linux/vmalloc.h>
+#include <linux/mutex.h>
+#include <asm/mmu_context.h>
+
+static DEFINE_MUTEX(mem_list_mutex);
+
+struct mm_iommu_table_group_mem_t {
+	struct list_head next;
+	struct rcu_head rcu;
+	unsigned long used;
+	atomic64_t mapped;
+	u64 ua;			/* userspace address */
+	u64 entries;		/* number of entries in hpas[] */
+	u64 *hpas;		/* vmalloc'ed */
+};
+
+static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
+		unsigned long npages, bool incr)
+{
+	long ret = 0, locked, lock_limit;
+
+	if (!npages)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+
+	if (incr) {
+		locked = mm->locked_vm + npages;
+		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			ret = -ENOMEM;
+		else
+			mm->locked_vm += npages;
+	} else {
+		if (WARN_ON_ONCE(npages > mm->locked_vm))
+			npages = mm->locked_vm;
+		mm->locked_vm -= npages;
+	}
+
+	pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
+			current->pid,
+			incr ? '+' : '-',
+			npages << PAGE_SHIFT,
+			mm->locked_vm << PAGE_SHIFT,
+			rlimit(RLIMIT_MEMLOCK));
+	up_write(&mm->mmap_sem);
+
+	return ret;
+}
+
+bool mm_iommu_preregistered(void)
+{
+	if (!current || !current->mm)
+		return false;
+
+	return !list_empty(&current->mm->context.iommu_group_mem_list);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
+
+long mm_iommu_get(unsigned long ua, unsigned long entries,
+		struct mm_iommu_table_group_mem_t **pmem)
+{
+	struct mm_iommu_table_group_mem_t *mem;
+	long i, j, ret = 0, locked_entries = 0;
+	struct page *page = NULL;
+
+	if (!current || !current->mm)
+		return -ESRCH; /* process exited */
+
+	mutex_lock(&mem_list_mutex);
+
+	list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list,
+			next) {
+		if ((mem->ua == ua) && (mem->entries == entries)) {
+			++mem->used;
+			*pmem = mem;
+			goto unlock_exit;
+		}
+
+		/* Overlap? */
+		if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
+				(ua < (mem->ua +
+				       (mem->entries << PAGE_SHIFT)))) {
+			ret = -EINVAL;
+			goto unlock_exit;
+		}
+
+	}
+
+	ret = mm_iommu_adjust_locked_vm(current->mm, entries, true);
+	if (ret)
+		goto unlock_exit;
+
+	locked_entries = entries;
+
+	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+	if (!mem) {
+		ret = -ENOMEM;
+		goto unlock_exit;
+	}
+
+	mem->hpas = vzalloc(entries * sizeof(mem->hpas[0]));
+	if (!mem->hpas) {
+		kfree(mem);
+		ret = -ENOMEM;
+		goto unlock_exit;
+	}
+
+	for (i = 0; i < entries; ++i) {
+		if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
+					1/* pages */, 1/* iswrite */, &page)) {
+			for (j = 0; j < i; ++j)
+				put_page(pfn_to_page(
+						mem->hpas[j] >> PAGE_SHIFT));
+			vfree(mem->hpas);
+			kfree(mem);
+			ret = -EFAULT;
+			goto unlock_exit;
+		}
+
+		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
+	}
+
+	atomic64_set(&mem->mapped, 1);
+	mem->used = 1;
+	mem->ua = ua;
+	mem->entries = entries;
+	*pmem = mem;
+
+	list_add_rcu(&mem->next, &current->mm->context.iommu_group_mem_list);
+
+unlock_exit:
+	if (locked_entries && ret)
+		mm_iommu_adjust_locked_vm(current->mm, locked_entries, false);
+
+	mutex_unlock(&mem_list_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_get);
+
+static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
+{
+	long i;
+	struct page *page = NULL;
+
+	for (i = 0; i < mem->entries; ++i) {
+		if (!mem->hpas[i])
+			continue;
+
+		page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
+		if (!page)
+			continue;
+
+		put_page(page);
+		mem->hpas[i] = 0;
+	}
+}
+
+static void mm_iommu_do_free(struct mm_iommu_table_group_mem_t *mem)
+{
+
+	mm_iommu_unpin(mem);
+	vfree(mem->hpas);
+	kfree(mem);
+}
+
+static void mm_iommu_free(struct rcu_head *head)
+{
+	struct mm_iommu_table_group_mem_t *mem = container_of(head,
+			struct mm_iommu_table_group_mem_t, rcu);
+
+	mm_iommu_do_free(mem);
+}
+
+static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem)
+{
+	list_del_rcu(&mem->next);
+	mm_iommu_adjust_locked_vm(current->mm, mem->entries, false);
+	call_rcu(&mem->rcu, mm_iommu_free);
+}
+
+long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem)
+{
+	long ret = 0;
+
+	if (!current || !current->mm)
+		return -ESRCH; /* process exited */
+
+	mutex_lock(&mem_list_mutex);
+
+	if (mem->used == 0) {
+		ret = -ENOENT;
+		goto unlock_exit;
+	}
+
+	--mem->used;
+	/* There are still users, exit */
+	if (mem->used)
+		goto unlock_exit;
+
+	/* Are there still mappings? */
+	if (atomic_cmpxchg(&mem->mapped, 1, 0) != 1) {
+		++mem->used;
+		ret = -EBUSY;
+		goto unlock_exit;
+	}
+
+	/* @mapped became 0 so now mappings are disabled, release the region */
+	mm_iommu_release(mem);
+
+unlock_exit:
+	mutex_unlock(&mem_list_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_put);
+
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
+		unsigned long size)
+{
+	struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+	list_for_each_entry_rcu(mem,
+			&current->mm->context.iommu_group_mem_list,
+			next) {
+		if ((mem->ua <= ua) &&
+				(ua + size <= mem->ua +
+				 (mem->entries << PAGE_SHIFT))) {
+			ret = mem;
+			break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_lookup);
+
+struct mm_iommu_table_group_mem_t *mm_iommu_find(unsigned long ua,
+		unsigned long entries)
+{
+	struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+	list_for_each_entry_rcu(mem,
+			&current->mm->context.iommu_group_mem_list,
+			next) {
+		if ((mem->ua == ua) && (mem->entries == entries)) {
+			ret = mem;
+			break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_find);
+
+long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
+		unsigned long ua, unsigned long *hpa)
+{
+	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
+	u64 *va = &mem->hpas[entry];
+
+	if (entry >= mem->entries)
+		return -EFAULT;
+
+	*hpa = *va | (ua & ~PAGE_MASK);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
+
+long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
+{
+	if (atomic64_inc_not_zero(&mem->mapped))
+		return 0;
+
+	/* Last mm_iommu_put() has been called, no more mappings allowed() */
+	return -ENXIO;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
+
+void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
+{
+	atomic64_add_unless(&mem->mapped, -1, 1);
+}
+EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
+
+void mm_iommu_init(mm_context_t *ctx)
+{
+	INIT_LIST_HEAD_RCU(&ctx->iommu_group_mem_list);
+}
+
+void mm_iommu_cleanup(mm_context_t *ctx)
+{
+	struct mm_iommu_table_group_mem_t *mem, *tmp;
+
+	list_for_each_entry_safe(mem, tmp, &ctx->iommu_group_mem_list, next) {
+		list_del_rcu(&mem->next);
+		mm_iommu_do_free(mem);
+	}
+}
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 59daa5eeec25..6bfadf1aa5cb 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -839,6 +839,17 @@ pmd_t pmdp_get_and_clear(struct mm_struct *mm,
 	 * hash fault look at them.
 	 */
 	memset(pgtable, 0, PTE_FRAG_SIZE);
+	/*
+	 * Serialize against find_linux_pte_or_hugepte which does lock-less
+	 * lookup in page tables with local interrupts disabled. For huge pages
+	 * it casts pmd_t to pte_t. Since format of pte_t is different from
+	 * pmd_t we want to prevent transit from pmd pointing to page table
+	 * to pmd pointing to huge page (and back) while interrupts are disabled.
+	 * We clear pmd to possibly replace it with page table pointer in
+	 * different code paths. So make sure we wait for the parallel
+	 * find_linux_pte_or_hugepage to finish.
+	 */
+	kick_all_cpus_sync();
 	return old_pmd;
 }
 
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index 89bf95bd63b1..765b419883f2 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -398,18 +398,18 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT)
 	rldicl	r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3
 	clrrdi	r15,r15,3
 	cmpdi	cr0,r14,0
-	bge	tlb_miss_fault_e6500	/* Bad pgd entry or hugepage; bail */
+	bge	tlb_miss_huge_e6500	/* Bad pgd entry or hugepage; bail */
 	ldx	r14,r14,r15		/* grab pud entry */
 
 	rldicl	r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3
 	clrrdi	r15,r15,3
 	cmpdi	cr0,r14,0
-	bge	tlb_miss_fault_e6500
+	bge	tlb_miss_huge_e6500
 	ldx	r14,r14,r15		/* Grab pmd entry */
 
 	mfspr	r10,SPRN_MAS0
 	cmpdi	cr0,r14,0
-	bge	tlb_miss_fault_e6500
+	bge	tlb_miss_huge_e6500
 
 	/* Now we build the MAS for a 2M indirect page:
 	 *
@@ -428,6 +428,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT)
 	clrrdi	r15,r16,21		/* make EA 2M-aligned */
 	mtspr	SPRN_MAS2,r15
 
+tlb_miss_huge_done_e6500:
 	lbz	r15,TCD_ESEL_NEXT(r11)
 	lbz	r16,TCD_ESEL_MAX(r11)
 	lbz	r14,TCD_ESEL_FIRST(r11)
@@ -456,6 +457,50 @@ END_FTR_SECTION_IFSET(CPU_FTR_SMT)
 	tlb_epilog_bolted
 	rfi
 
+tlb_miss_huge_e6500:
+	beq	tlb_miss_fault_e6500
+	li	r10,1
+	andi.	r15,r14,HUGEPD_SHIFT_MASK@l /* r15 = psize */
+	rldimi	r14,r10,63,0		/* Set PD_HUGE */
+	xor	r14,r14,r15		/* Clear size bits */
+	ldx	r14,0,r14
+
+	/*
+	 * Now we build the MAS for a huge page.
+	 *
+	 * MAS 0   :	ESEL needs to be filled by software round-robin
+	 *		 - can be handled by indirect code
+	 * MAS 1   :	Need to clear IND and set TSIZE
+	 * MAS 2,3+7:	Needs to be redone similar to non-tablewalk handler
+	 */
+
+	subi	r15,r15,10		/* Convert psize to tsize */
+	mfspr	r10,SPRN_MAS1
+	rlwinm	r10,r10,0,~MAS1_IND
+	rlwimi	r10,r15,MAS1_TSIZE_SHIFT,MAS1_TSIZE_MASK
+	mtspr	SPRN_MAS1,r10
+
+	li	r10,-0x400
+	sld	r15,r10,r15		/* Generate mask based on size */
+	and	r10,r16,r15
+	rldicr	r15,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT
+	rlwimi	r10,r14,32-19,27,31	/* Insert WIMGE */
+	clrldi	r15,r15,PAGE_SHIFT	/* Clear crap at the top */
+	rlwimi	r15,r14,32-8,22,25	/* Move in U bits */
+	mtspr	SPRN_MAS2,r10
+	andi.	r10,r14,_PAGE_DIRTY
+	rlwimi	r15,r14,32-2,26,31	/* Move in BAP bits */
+
+	/* Mask out SW and UW if !DIRTY (XXX optimize this !) */
+	bne	1f
+	li	r10,MAS3_SW|MAS3_UW
+	andc	r15,r15,r10
+1:
+	mtspr	SPRN_MAS7_MAS3,r15
+
+	mfspr	r10,SPRN_MAS0
+	b	tlb_miss_huge_done_e6500
+
 tlb_miss_kernel_e6500:
 	ld	r14,PACA_KERNELPGD(r13)
 	cmpldi	cr1,r15,8		/* Check for vmalloc region */
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index cbd3d069897f..723a099f6be3 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -217,7 +217,7 @@ static DEFINE_RAW_SPINLOCK(tlbivax_lock);
 static int mm_is_core_local(struct mm_struct *mm)
 {
 	return cpumask_subset(mm_cpumask(mm),
-			      topology_thread_cpumask(smp_processor_id()));
+			      topology_sibling_cpumask(smp_processor_id()));
 }
 
 struct tlb_flush_param {
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 12b638425bb9..d90893b76e7c 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -131,7 +131,16 @@ static void pmao_restore_workaround(bool ebb) { }
 
 static bool regs_use_siar(struct pt_regs *regs)
 {
-	return !!regs->result;
+	/*
+	 * When we take a performance monitor exception the regs are setup
+	 * using perf_read_regs() which overloads some fields, in particular
+	 * regs->result to tell us whether to use SIAR.
+	 *
+	 * However if the regs are from another exception, eg. a syscall, then
+	 * they have not been setup using perf_read_regs() and so regs->result
+	 * is something random.
+	 */
+	return ((TRAP(regs) == 0xf00) && regs->result);
 }
 
 /*
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
index c949ca055712..63016621aff8 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c
@@ -193,7 +193,7 @@ static struct irq_chip mpc52xx_gpt_irq_chip = {
 
 void mpc52xx_gpt_irq_cascade(unsigned int virq, struct irq_desc *desc)
 {
-	struct mpc52xx_gpt_priv *gpt = irq_get_handler_data(virq);
+	struct mpc52xx_gpt_priv *gpt = irq_desc_get_handler_data(desc);
 	int sub_virq;
 	u32 status;
 
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pci.c b/arch/powerpc/platforms/52xx/mpc52xx_pci.c
index e2d401ad8fbb..6eb3b2abae90 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_pci.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_pci.c
@@ -12,7 +12,7 @@
 
 #undef DEBUG
 
-#include <asm/pci.h>
+#include <linux/pci.h>
 #include <asm/mpc52xx.h>
 #include <asm/delay.h>
 #include <asm/machdep.h>
diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig
index 2fb4b24368a6..97915feffd42 100644
--- a/arch/powerpc/platforms/85xx/Kconfig
+++ b/arch/powerpc/platforms/85xx/Kconfig
@@ -282,7 +282,7 @@ config CORENET_GENERIC
 	  For 64bit kernel, the following boards are supported:
 	    T208x QDS/RDB, T4240 QDS/RDB and B4 QDS
 	  The following boards are supported for both 32bit and 64bit kernel:
-	    P5020 DS, P5040 DS and T104xQDS/RDB
+	    P5020 DS, P5040 DS, T102x QDS/RDB, T104x QDS/RDB
 
 endif # FSL_SOC_BOOKE
 
diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c
index 9824d2cf79bd..bd839dc287fe 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -150,6 +150,9 @@ static const char * const boards[] __initconst = {
 	"fsl,B4860QDS",
 	"fsl,B4420QDS",
 	"fsl,B4220QDS",
+	"fsl,T1023RDB",
+	"fsl,T1024QDS",
+	"fsl,T1024RDB",
 	"fsl,T1040QDS",
 	"fsl,T1042QDS",
 	"fsl,T1040RDB",
diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index 8631ac5f0e57..b8b821697910 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -345,6 +345,7 @@ void mpc85xx_smp_kexec_cpu_down(int crash_shutdown, int secondary)
 	local_irq_disable();
 
 	if (secondary) {
+		__flush_disable_L1();
 		atomic_inc(&kexec_down_cpus);
 		/* loop forever */
 		while (1);
@@ -357,61 +358,11 @@ static void mpc85xx_smp_kexec_down(void *arg)
 		ppc_md.kexec_cpu_down(0,1);
 }
 
-static void map_and_flush(unsigned long paddr)
-{
-	struct page *page = pfn_to_page(paddr >> PAGE_SHIFT);
-	unsigned long kaddr  = (unsigned long)kmap_atomic(page);
-
-	flush_dcache_range(kaddr, kaddr + PAGE_SIZE);
-	kunmap_atomic((void *)kaddr);
-}
-
-/**
- * Before we reset the other cores, we need to flush relevant cache
- * out to memory so we don't get anything corrupted, some of these flushes
- * are performed out of an overabundance of caution as interrupts are not
- * disabled yet and we can switch cores
- */
-static void mpc85xx_smp_flush_dcache_kexec(struct kimage *image)
-{
-	kimage_entry_t *ptr, entry;
-	unsigned long paddr;
-	int i;
-
-	if (image->type == KEXEC_TYPE_DEFAULT) {
-		/* normal kexec images are stored in temporary pages */
-		for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE);
-		     ptr = (entry & IND_INDIRECTION) ?
-				phys_to_virt(entry & PAGE_MASK) : ptr + 1) {
-			if (!(entry & IND_DESTINATION)) {
-				map_and_flush(entry);
-			}
-		}
-		/* flush out last IND_DONE page */
-		map_and_flush(entry);
-	} else {
-		/* crash type kexec images are copied to the crash region */
-		for (i = 0; i < image->nr_segments; i++) {
-			struct kexec_segment *seg = &image->segment[i];
-			for (paddr = seg->mem; paddr < seg->mem + seg->memsz;
-			     paddr += PAGE_SIZE) {
-				map_and_flush(paddr);
-			}
-		}
-	}
-
-	/* also flush the kimage struct to be passed in as well */
-	flush_dcache_range((unsigned long)image,
-			   (unsigned long)image + sizeof(*image));
-}
-
 static void mpc85xx_smp_machine_kexec(struct kimage *image)
 {
 	int timeout = INT_MAX;
 	int i, num_cpus = num_present_cpus();
 
-	mpc85xx_smp_flush_dcache_kexec(image);
-
 	if (image->type == KEXEC_TYPE_DEFAULT)
 		smp_call_function(mpc85xx_smp_kexec_down, NULL, 0);
 
diff --git a/arch/powerpc/platforms/85xx/twr_p102x.c b/arch/powerpc/platforms/85xx/twr_p102x.c
index 1eadb6d0dc64..30e002f4648c 100644
--- a/arch/powerpc/platforms/85xx/twr_p102x.c
+++ b/arch/powerpc/platforms/85xx/twr_p102x.c
@@ -79,7 +79,7 @@ static void __init twr_p1025_setup_arch(void)
 	mpc85xx_qe_init();
 	mpc85xx_qe_par_io_init();
 
-#if defined(CONFIG_UCC_GETH) || defined(CONFIG_SERIAL_QE)
+#if IS_ENABLED(CONFIG_UCC_GETH) || IS_ENABLED(CONFIG_SERIAL_QE)
 	if (machine_is(twr_p1025)) {
 		struct ccsr_guts __iomem *guts;
 
@@ -101,7 +101,7 @@ static void __init twr_p1025_setup_arch(void)
 					MPC85xx_PMUXCR_QE(12));
 			iounmap(guts);
 
-#if defined(CONFIG_SERIAL_QE)
+#if IS_ENABLED(CONFIG_SERIAL_QE)
 			/* On P1025TWR board, the UCC7 acted as UART port.
 			 * However, The UCC7's CTS pin is low level in default,
 			 * it will impact the transmission in full duplex
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 7264e91190be..c140e94c7c72 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -405,6 +405,16 @@ config PPC_DOORBELL
 
 endmenu
 
+config VDSO32
+	def_bool y
+	depends on PPC32 || CPU_BIG_ENDIAN
+	help
+	  This symbol controls whether we build the 32-bit VDSO. We obviously
+	  want to do that if we're building a 32-bit kernel. If we're building
+	  a 64-bit kernel then we only want a 32-bit VDSO if we're building for
+	  big endian. That is because the only little endian configuration we
+	  support is ppc64le which is 64-bit only.
+
 choice
 	prompt "Endianness selection"
 	default CPU_BIG_ENDIAN
@@ -421,6 +431,7 @@ config CPU_BIG_ENDIAN
 
 config CPU_LITTLE_ENDIAN
 	bool "Build little endian kernel"
+	depends on PPC_BOOK3S_64
 	select PPC64_BOOT_WRAPPER
 	help
 	  Build a little endian kernel.
diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c
index 623bd961465a..fe51de4fcf13 100644
--- a/arch/powerpc/platforms/cell/axon_msi.c
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -22,6 +22,7 @@
 #include <asm/machdep.h>
 #include <asm/prom.h>
 
+#include "cell.h"
 
 /*
  * MSIC registers, specified as offsets from dcr_base
@@ -95,7 +96,7 @@ static void msic_dcr_write(struct axon_msic *msic, unsigned int dcr_n, u32 val)
 static void axon_msi_cascade(unsigned int irq, struct irq_desc *desc)
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
-	struct axon_msic *msic = irq_get_handler_data(irq);
+	struct axon_msic *msic = irq_desc_get_handler_data(desc);
 	u32 write_offset, msi;
 	int idx;
 	int retry = 0;
@@ -406,8 +407,8 @@ static int axon_msi_probe(struct platform_device *device)
 
 	dev_set_drvdata(&device->dev, msic);
 
-	ppc_md.setup_msi_irqs = axon_msi_setup_msi_irqs;
-	ppc_md.teardown_msi_irqs = axon_msi_teardown_msi_irqs;
+	cell_pci_controller_ops.setup_msi_irqs = axon_msi_setup_msi_irqs;
+	cell_pci_controller_ops.teardown_msi_irqs = axon_msi_teardown_msi_irqs;
 
 	axon_msi_debug_setup(dn, msic);
 
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index 21b502398bf3..14a582b21274 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -466,6 +466,11 @@ static inline u32 cell_iommu_get_ioid(struct device_node *np)
 	return *ioid;
 }
 
+static struct iommu_table_ops cell_iommu_ops = {
+	.set = tce_build_cell,
+	.clear = tce_free_cell
+};
+
 static struct iommu_window * __init
 cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 			unsigned long offset, unsigned long size,
@@ -492,6 +497,7 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 	window->table.it_offset =
 		(offset >> window->table.it_page_shift) + pte_offset;
 	window->table.it_size = size >> window->table.it_page_shift;
+	window->table.it_ops = &cell_iommu_ops;
 
 	iommu_init_table(&window->table, iommu->nid);
 
@@ -1201,8 +1207,6 @@ static int __init cell_iommu_init(void)
 	/* Setup various callbacks */
 	cell_pci_controller_ops.dma_dev_setup = cell_pci_dma_dev_setup;
 	ppc_md.dma_get_required_mask = cell_dma_get_required_mask;
-	ppc_md.tce_build = tce_build_cell;
-	ppc_md.tce_free = tce_free_cell;
 
 	if (!iommu_fixed_disabled && cell_iommu_fixed_mapping_init() == 0)
 		goto bail;
diff --git a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c
index c269caee58f9..9dd154d6f89a 100644
--- a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c
+++ b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c
@@ -124,7 +124,7 @@ static void hlwd_pic_irq_cascade(unsigned int cascade_virq,
 				      struct irq_desc *desc)
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
-	struct irq_domain *irq_domain = irq_get_handler_data(cascade_virq);
+	struct irq_domain *irq_domain = irq_desc_get_handler_data(desc);
 	unsigned int virq;
 
 	raw_spin_lock(&desc->lock);
diff --git a/arch/powerpc/platforms/pasemi/Makefile b/arch/powerpc/platforms/pasemi/Makefile
index 8e8d4cae5ebe..60b4e0fd9808 100644
--- a/arch/powerpc/platforms/pasemi/Makefile
+++ b/arch/powerpc/platforms/pasemi/Makefile
@@ -1,2 +1,3 @@
 obj-y	+= setup.o pci.o time.o idle.o powersave.o iommu.o dma_lib.o misc.o
 obj-$(CONFIG_PPC_PASEMI_MDIO)	+= gpio_mdio.o
+obj-$(CONFIG_PCI_MSI)		+= msi.o
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index b8f567b2ea19..c929644e74a6 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -134,6 +134,10 @@ static void iobmap_free(struct iommu_table *tbl, long index,
 	}
 }
 
+static struct iommu_table_ops iommu_table_iobmap_ops = {
+	.set = iobmap_build,
+	.clear  = iobmap_free
+};
 
 static void iommu_table_iobmap_setup(void)
 {
@@ -153,6 +157,7 @@ static void iommu_table_iobmap_setup(void)
 	 * Should probably be 8 (64 bytes)
 	 */
 	iommu_table_iobmap.it_blocksize = 4;
+	iommu_table_iobmap.it_ops = &iommu_table_iobmap_ops;
 	iommu_init_table(&iommu_table_iobmap, 0);
 	pr_debug(" <- %s\n", __func__);
 }
@@ -252,8 +257,6 @@ void __init iommu_init_early_pasemi(void)
 
 	pasemi_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pasemi;
 	pasemi_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pasemi;
-	ppc_md.tce_build = iobmap_build;
-	ppc_md.tce_free  = iobmap_free;
 	set_pci_dma_ops(&dma_iommu_ops);
 }
 
diff --git a/arch/powerpc/sysdev/mpic_pasemi_msi.c b/arch/powerpc/platforms/pasemi/msi.c
index a3f660eed6de..27f2b187a91b 100644
--- a/arch/powerpc/sysdev/mpic_pasemi_msi.c
+++ b/arch/powerpc/platforms/pasemi/msi.c
@@ -13,8 +13,6 @@
  *
  */
 
-#undef DEBUG
-
 #include <linux/irq.h>
 #include <linux/msi.h>
 #include <asm/mpic.h>
@@ -23,7 +21,7 @@
 #include <asm/ppc-pci.h>
 #include <asm/msi_bitmap.h>
 
-#include "mpic.h"
+#include <sysdev/mpic.h>
 
 /* Allocate 16 interrupts per device, to give an alignment of 16,
  * since that's the size of the grouping w.r.t. affinity. If someone
@@ -144,6 +142,7 @@ static int pasemi_msi_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 int mpic_pasemi_msi_init(struct mpic *mpic)
 {
 	int rc;
+	struct pci_controller *phb;
 
 	if (!mpic->irqhost->of_node ||
 	    !of_device_is_compatible(mpic->irqhost->of_node,
@@ -159,9 +158,11 @@ int mpic_pasemi_msi_init(struct mpic *mpic)
 	pr_debug("pasemi_msi: Registering PA Semi MPIC MSI callbacks\n");
 
 	msi_mpic = mpic;
-	WARN_ON(ppc_md.setup_msi_irqs);
-	ppc_md.setup_msi_irqs = pasemi_msi_setup_msi_irqs;
-	ppc_md.teardown_msi_irqs = pasemi_msi_teardown_msi_irqs;
+	list_for_each_entry(phb, &hose_list, list_node) {
+		WARN_ON(phb->controller_ops.setup_msi_irqs);
+		phb->controller_ops.setup_msi_irqs = pasemi_msi_setup_msi_irqs;
+		phb->controller_ops.teardown_msi_irqs = pasemi_msi_teardown_msi_irqs;
+	}
 
 	return 0;
 }
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 4b044d8cb49a..604190cab522 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -19,3 +19,10 @@ config PPC_POWERNV
 	select CPU_FREQ_GOV_CONSERVATIVE
 	select PPC_DOORBELL
 	default y
+
+config OPAL_PRD
+	tristate 'OPAL PRD driver'
+	depends on PPC_POWERNV
+	help
+	  This enables the opal-prd driver, a facility to run processor
+	  recovery diagnostics on OpenPower machines
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 33e44f37212f..1c8cdb6250e7 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -1,7 +1,7 @@
-obj-y			+= setup.o opal-wrappers.o opal.o opal-async.o
+obj-y			+= setup.o opal-wrappers.o opal.o opal-async.o idle.o
 obj-y			+= opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
 obj-y			+= rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
-obj-y			+= opal-msglog.o opal-hmi.o opal-power.o
+obj-y			+= opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
 
 obj-$(CONFIG_SMP)	+= smp.o subcore.o subcore-asm.o
 obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
@@ -9,3 +9,4 @@ obj-$(CONFIG_EEH)	+= eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
 obj-$(CONFIG_TRACEPOINTS)	+= opal-tracepoints.o
+obj-$(CONFIG_OPAL_PRD)	+= opal-prd.o
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index ce738ab3d5a9..5cf5e6ea213b 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -16,6 +16,7 @@
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/list.h>
 #include <linux/msi.h>
 #include <linux/of.h>
@@ -40,6 +41,7 @@
 #include "pci.h"
 
 static bool pnv_eeh_nb_init = false;
+static int eeh_event_irq = -EINVAL;
 
 /**
  * pnv_eeh_init - EEH platform dependent initialization
@@ -88,34 +90,22 @@ static int pnv_eeh_init(void)
 	return 0;
 }
 
-static int pnv_eeh_event(struct notifier_block *nb,
-			 unsigned long events, void *change)
+static irqreturn_t pnv_eeh_event(int irq, void *data)
 {
-	uint64_t changed_evts = (uint64_t)change;
-
 	/*
-	 * We simply send special EEH event if EEH has
-	 * been enabled, or clear pending events in
-	 * case that we enable EEH soon
+	 * We simply send a special EEH event if EEH has been
+	 * enabled. We don't care about EEH events until we've
+	 * finished processing the outstanding ones. Event processing
+	 * gets unmasked in next_error() if EEH is enabled.
 	 */
-	if (!(changed_evts & OPAL_EVENT_PCI_ERROR) ||
-	    !(events & OPAL_EVENT_PCI_ERROR))
-		return 0;
+	disable_irq_nosync(irq);
 
 	if (eeh_enabled())
 		eeh_send_failure_event(NULL);
-	else
-		opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
 
-	return 0;
+	return IRQ_HANDLED;
 }
 
-static struct notifier_block pnv_eeh_nb = {
-	.notifier_call	= pnv_eeh_event,
-	.next		= NULL,
-	.priority	= 0
-};
-
 #ifdef CONFIG_DEBUG_FS
 static ssize_t pnv_eeh_ei_write(struct file *filp,
 				const char __user *user_buf,
@@ -237,16 +227,28 @@ static int pnv_eeh_post_init(void)
 
 	/* Register OPAL event notifier */
 	if (!pnv_eeh_nb_init) {
-		ret = opal_notifier_register(&pnv_eeh_nb);
-		if (ret) {
-			pr_warn("%s: Can't register OPAL event notifier (%d)\n",
-				__func__, ret);
+		eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR));
+		if (eeh_event_irq < 0) {
+			pr_err("%s: Can't register OPAL event interrupt (%d)\n",
+			       __func__, eeh_event_irq);
+			return eeh_event_irq;
+		}
+
+		ret = request_irq(eeh_event_irq, pnv_eeh_event,
+				IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL);
+		if (ret < 0) {
+			irq_dispose_mapping(eeh_event_irq);
+			pr_err("%s: Can't request OPAL event interrupt (%d)\n",
+			       __func__, eeh_event_irq);
 			return ret;
 		}
 
 		pnv_eeh_nb_init = true;
 	}
 
+	if (!eeh_enabled())
+		disable_irq(eeh_event_irq);
+
 	list_for_each_entry(hose, &hose_list, list_node) {
 		phb = hose->private_data;
 
@@ -979,7 +981,7 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
 /**
  * pnv_eeh_wait_state - Wait for PE state
  * @pe: EEH PE
- * @max_wait: maximal period in microsecond
+ * @max_wait: maximal period in millisecond
  *
  * Wait for the state of associated PE. It might take some time
  * to retrieve the PE's state.
@@ -1000,13 +1002,13 @@ static int pnv_eeh_wait_state(struct eeh_pe *pe, int max_wait)
 		if (ret != EEH_STATE_UNAVAILABLE)
 			return ret;
 
-		max_wait -= mwait;
 		if (max_wait <= 0) {
 			pr_warn("%s: Timeout getting PE#%x's state (%d)\n",
 				__func__, pe->addr, max_wait);
 			return EEH_STATE_NOT_SUPPORT;
 		}
 
+		max_wait -= mwait;
 		msleep(mwait);
 	}
 
@@ -1303,12 +1305,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 	int state, ret = EEH_NEXT_ERR_NONE;
 
 	/*
-	 * While running here, it's safe to purge the event queue.
-	 * And we should keep the cached OPAL notifier event sychronized
-	 * between the kernel and firmware.
+	 * While running here, it's safe to purge the event queue. The
+	 * event should still be masked.
 	 */
 	eeh_remove_event(NULL, false);
-	opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
 
 	list_for_each_entry(hose, &hose_list, list_node) {
 		/*
@@ -1477,6 +1477,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 			break;
 	}
 
+	/* Unmask the event */
+	if (eeh_enabled())
+		enable_irq(eeh_event_irq);
+
 	return ret;
 }
 
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
new file mode 100644
index 000000000000..59d735d2e5c0
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -0,0 +1,293 @@
+/*
+ * PowerNV cpuidle code
+ *
+ * Copyright 2015 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/device.h>
+#include <linux/cpu.h>
+
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+#include <asm/opal.h>
+#include <asm/cputhreads.h>
+#include <asm/cpuidle.h>
+#include <asm/code-patching.h>
+#include <asm/smp.h>
+
+#include "powernv.h"
+#include "subcore.h"
+
+static u32 supported_cpuidle_states;
+
+int pnv_save_sprs_for_winkle(void)
+{
+	int cpu;
+	int rc;
+
+	/*
+	 * hid0, hid1, hid4, hid5, hmeer and lpcr values are symmetric accross
+	 * all cpus at boot. Get these reg values of current cpu and use the
+	 * same accross all cpus.
+	 */
+	uint64_t lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1;
+	uint64_t hid0_val = mfspr(SPRN_HID0);
+	uint64_t hid1_val = mfspr(SPRN_HID1);
+	uint64_t hid4_val = mfspr(SPRN_HID4);
+	uint64_t hid5_val = mfspr(SPRN_HID5);
+	uint64_t hmeer_val = mfspr(SPRN_HMEER);
+
+	for_each_possible_cpu(cpu) {
+		uint64_t pir = get_hard_smp_processor_id(cpu);
+		uint64_t hsprg0_val = (uint64_t)&paca[cpu];
+
+		/*
+		 * HSPRG0 is used to store the cpu's pointer to paca. Hence last
+		 * 3 bits are guaranteed to be 0. Program slw to restore HSPRG0
+		 * with 63rd bit set, so that when a thread wakes up at 0x100 we
+		 * can use this bit to distinguish between fastsleep and
+		 * deep winkle.
+		 */
+		hsprg0_val |= 1;
+
+		rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val);
+		if (rc != 0)
+			return rc;
+
+		rc = opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);
+		if (rc != 0)
+			return rc;
+
+		/* HIDs are per core registers */
+		if (cpu_thread_in_core(cpu) == 0) {
+
+			rc = opal_slw_set_reg(pir, SPRN_HMEER, hmeer_val);
+			if (rc != 0)
+				return rc;
+
+			rc = opal_slw_set_reg(pir, SPRN_HID0, hid0_val);
+			if (rc != 0)
+				return rc;
+
+			rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val);
+			if (rc != 0)
+				return rc;
+
+			rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val);
+			if (rc != 0)
+				return rc;
+
+			rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val);
+			if (rc != 0)
+				return rc;
+		}
+	}
+
+	return 0;
+}
+
+static void pnv_alloc_idle_core_states(void)
+{
+	int i, j;
+	int nr_cores = cpu_nr_cores();
+	u32 *core_idle_state;
+
+	/*
+	 * core_idle_state - First 8 bits track the idle state of each thread
+	 * of the core. The 8th bit is the lock bit. Initially all thread bits
+	 * are set. They are cleared when the thread enters deep idle state
+	 * like sleep and winkle. Initially the lock bit is cleared.
+	 * The lock bit has 2 purposes
+	 * a. While the first thread is restoring core state, it prevents
+	 * other threads in the core from switching to process context.
+	 * b. While the last thread in the core is saving the core state, it
+	 * prevents a different thread from waking up.
+	 */
+	for (i = 0; i < nr_cores; i++) {
+		int first_cpu = i * threads_per_core;
+		int node = cpu_to_node(first_cpu);
+
+		core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
+		*core_idle_state = PNV_CORE_IDLE_THREAD_BITS;
+
+		for (j = 0; j < threads_per_core; j++) {
+			int cpu = first_cpu + j;
+
+			paca[cpu].core_idle_state_ptr = core_idle_state;
+			paca[cpu].thread_idle_state = PNV_THREAD_RUNNING;
+			paca[cpu].thread_mask = 1 << j;
+		}
+	}
+
+	update_subcore_sibling_mask();
+
+	if (supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED)
+		pnv_save_sprs_for_winkle();
+}
+
+u32 pnv_get_supported_cpuidle_states(void)
+{
+	return supported_cpuidle_states;
+}
+EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
+
+
+static void pnv_fastsleep_workaround_apply(void *info)
+
+{
+	int rc;
+	int *err = info;
+
+	rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP,
+					OPAL_CONFIG_IDLE_APPLY);
+	if (rc)
+		*err = 1;
+}
+
+/*
+ * Used to store fastsleep workaround state
+ * 0 - Workaround applied/undone at fastsleep entry/exit path (Default)
+ * 1 - Workaround applied once, never undone.
+ */
+static u8 fastsleep_workaround_applyonce;
+
+static ssize_t show_fastsleep_workaround_applyonce(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", fastsleep_workaround_applyonce);
+}
+
+static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
+		struct device_attribute *attr, const char *buf,
+		size_t count)
+{
+	cpumask_t primary_thread_mask;
+	int err;
+	u8 val;
+
+	if (kstrtou8(buf, 0, &val) || val != 1)
+		return -EINVAL;
+
+	if (fastsleep_workaround_applyonce == 1)
+		return count;
+
+	/*
+	 * fastsleep_workaround_applyonce = 1 implies
+	 * fastsleep workaround needs to be left in 'applied' state on all
+	 * the cores. Do this by-
+	 * 1. Patching out the call to 'undo' workaround in fastsleep exit path
+	 * 2. Sending ipi to all the cores which have atleast one online thread
+	 * 3. Patching out the call to 'apply' workaround in fastsleep entry
+	 * path
+	 * There is no need to send ipi to cores which have all threads
+	 * offlined, as last thread of the core entering fastsleep or deeper
+	 * state would have applied workaround.
+	 */
+	err = patch_instruction(
+		(unsigned int *)pnv_fastsleep_workaround_at_exit,
+		PPC_INST_NOP);
+	if (err) {
+		pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_exit");
+		goto fail;
+	}
+
+	get_online_cpus();
+	primary_thread_mask = cpu_online_cores_map();
+	on_each_cpu_mask(&primary_thread_mask,
+				pnv_fastsleep_workaround_apply,
+				&err, 1);
+	put_online_cpus();
+	if (err) {
+		pr_err("fastsleep_workaround_applyonce change failed while running pnv_fastsleep_workaround_apply");
+		goto fail;
+	}
+
+	err = patch_instruction(
+		(unsigned int *)pnv_fastsleep_workaround_at_entry,
+		PPC_INST_NOP);
+	if (err) {
+		pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_entry");
+		goto fail;
+	}
+
+	fastsleep_workaround_applyonce = 1;
+
+	return count;
+fail:
+	return -EIO;
+}
+
+static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
+			show_fastsleep_workaround_applyonce,
+			store_fastsleep_workaround_applyonce);
+
+static int __init pnv_init_idle_states(void)
+{
+	struct device_node *power_mgt;
+	int dt_idle_states;
+	u32 *flags;
+	int i;
+
+	supported_cpuidle_states = 0;
+
+	if (cpuidle_disable != IDLE_NO_OVERRIDE)
+		goto out;
+
+	if (!firmware_has_feature(FW_FEATURE_OPALv3))
+		goto out;
+
+	power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
+	if (!power_mgt) {
+		pr_warn("opal: PowerMgmt Node not found\n");
+		goto out;
+	}
+	dt_idle_states = of_property_count_u32_elems(power_mgt,
+			"ibm,cpu-idle-state-flags");
+	if (dt_idle_states < 0) {
+		pr_warn("cpuidle-powernv: no idle states found in the DT\n");
+		goto out;
+	}
+
+	flags = kzalloc(sizeof(*flags) * dt_idle_states, GFP_KERNEL);
+	if (of_property_read_u32_array(power_mgt,
+			"ibm,cpu-idle-state-flags", flags, dt_idle_states)) {
+		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-flags in DT\n");
+		goto out_free;
+	}
+
+	for (i = 0; i < dt_idle_states; i++)
+		supported_cpuidle_states |= flags[i];
+
+	if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
+		patch_instruction(
+			(unsigned int *)pnv_fastsleep_workaround_at_entry,
+			PPC_INST_NOP);
+		patch_instruction(
+			(unsigned int *)pnv_fastsleep_workaround_at_exit,
+			PPC_INST_NOP);
+	} else {
+		/*
+		 * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that
+		 * workaround is needed to use fastsleep. Provide sysfs
+		 * control to choose how this workaround has to be applied.
+		 */
+		device_create_file(cpu_subsys.dev_root,
+				&dev_attr_fastsleep_workaround_applyonce);
+	}
+
+	pnv_alloc_idle_core_states();
+out_free:
+	kfree(flags);
+out:
+	return 0;
+}
+machine_subsys_initcall(powernv, pnv_init_idle_states);
diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c
index 693b6cdac691..bdc8c0c71d15 100644
--- a/arch/powerpc/platforms/powernv/opal-async.c
+++ b/arch/powerpc/platforms/powernv/opal-async.c
@@ -151,7 +151,7 @@ static struct notifier_block opal_async_comp_nb = {
 		.priority	= 0,
 };
 
-static int __init opal_async_comp_init(void)
+int __init opal_async_comp_init(void)
 {
 	struct device_node *opal_node;
 	const __be32 *async;
@@ -205,4 +205,3 @@ out_opal_node:
 out:
 	return err;
 }
-machine_subsys_initcall(powernv, opal_async_comp_init);
diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c
index 5aa9c1ce4de3..2ee96431f736 100644
--- a/arch/powerpc/platforms/powernv/opal-dump.c
+++ b/arch/powerpc/platforms/powernv/opal-dump.c
@@ -15,6 +15,7 @@
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
 #include <linux/delay.h>
+#include <linux/interrupt.h>
 
 #include <asm/opal.h>
 
@@ -60,7 +61,7 @@ static ssize_t dump_type_show(struct dump_obj *dump_obj,
 			      struct dump_attribute *attr,
 			      char *buf)
 {
-	
+
 	return sprintf(buf, "0x%x %s\n", dump_obj->type,
 		       dump_type_to_string(dump_obj->type));
 }
@@ -363,7 +364,7 @@ static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
 	return dump;
 }
 
-static int process_dump(void)
+static irqreturn_t process_dump(int irq, void *data)
 {
 	int rc;
 	uint32_t dump_id, dump_size, dump_type;
@@ -387,45 +388,13 @@ static int process_dump(void)
 	if (!dump)
 		return -1;
 
-	return 0;
-}
-
-static void dump_work_fn(struct work_struct *work)
-{
-	process_dump();
+	return IRQ_HANDLED;
 }
 
-static DECLARE_WORK(dump_work, dump_work_fn);
-
-static void schedule_process_dump(void)
-{
-	schedule_work(&dump_work);
-}
-
-/*
- * New dump available notification
- *
- * Once we get notification, we add sysfs entries for it.
- * We only fetch the dump on demand, and create sysfs asynchronously.
- */
-static int dump_event(struct notifier_block *nb,
-		      unsigned long events, void *change)
-{
-	if (events & OPAL_EVENT_DUMP_AVAIL)
-		schedule_process_dump();
-
-	return 0;
-}
-
-static struct notifier_block dump_nb = {
-	.notifier_call  = dump_event,
-	.next           = NULL,
-	.priority       = 0
-};
-
 void __init opal_platform_dump_init(void)
 {
 	int rc;
+	int dump_irq;
 
 	/* ELOG not supported by firmware */
 	if (!opal_check_token(OPAL_DUMP_READ))
@@ -445,10 +414,19 @@ void __init opal_platform_dump_init(void)
 		return;
 	}
 
-	rc = opal_notifier_register(&dump_nb);
+	dump_irq = opal_event_request(ilog2(OPAL_EVENT_DUMP_AVAIL));
+	if (!dump_irq) {
+		pr_err("%s: Can't register OPAL event irq (%d)\n",
+		       __func__, dump_irq);
+		return;
+	}
+
+	rc = request_threaded_irq(dump_irq, NULL, process_dump,
+				IRQF_TRIGGER_HIGH | IRQF_ONESHOT,
+				"opal-dump", NULL);
 	if (rc) {
-		pr_warn("%s: Can't register OPAL event notifier (%d)\n",
-			__func__, rc);
+		pr_err("%s: Can't request OPAL event irq (%d)\n",
+		       __func__, rc);
 		return;
 	}
 
diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c
index 38ce757e5e2a..4949ef0d9400 100644
--- a/arch/powerpc/platforms/powernv/opal-elog.c
+++ b/arch/powerpc/platforms/powernv/opal-elog.c
@@ -10,6 +10,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/of.h>
 #include <linux/slab.h>
 #include <linux/sysfs.h>
@@ -276,24 +277,15 @@ static void elog_work_fn(struct work_struct *work)
 
 static DECLARE_WORK(elog_work, elog_work_fn);
 
-static int elog_event(struct notifier_block *nb,
-				unsigned long events, void *change)
+static irqreturn_t elog_event(int irq, void *data)
 {
-	/* check for error log event */
-	if (events & OPAL_EVENT_ERROR_LOG_AVAIL)
-		schedule_work(&elog_work);
-	return 0;
+	schedule_work(&elog_work);
+	return IRQ_HANDLED;
 }
 
-static struct notifier_block elog_nb = {
-	.notifier_call  = elog_event,
-	.next           = NULL,
-	.priority       = 0
-};
-
 int __init opal_elog_init(void)
 {
-	int rc = 0;
+	int rc = 0, irq;
 
 	/* ELOG not supported by firmware */
 	if (!opal_check_token(OPAL_ELOG_READ))
@@ -305,10 +297,18 @@ int __init opal_elog_init(void)
 		return -1;
 	}
 
-	rc = opal_notifier_register(&elog_nb);
+	irq = opal_event_request(ilog2(OPAL_EVENT_ERROR_LOG_AVAIL));
+	if (!irq) {
+		pr_err("%s: Can't register OPAL event irq (%d)\n",
+		       __func__, irq);
+		return irq;
+	}
+
+	rc = request_irq(irq, elog_event,
+			IRQ_TYPE_LEVEL_HIGH, "opal-elog", NULL);
 	if (rc) {
-		pr_err("%s: Can't register OPAL event notifier (%d)\n",
-		__func__, rc);
+		pr_err("%s: Can't request OPAL event irq (%d)\n",
+		       __func__, rc);
 		return rc;
 	}
 
diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c
index b322bfb51343..a8f49d380449 100644
--- a/arch/powerpc/platforms/powernv/opal-hmi.c
+++ b/arch/powerpc/platforms/powernv/opal-hmi.c
@@ -170,7 +170,7 @@ static struct notifier_block opal_hmi_handler_nb = {
 	.priority	= 0,
 };
 
-static int __init opal_hmi_handler_init(void)
+int __init opal_hmi_handler_init(void)
 {
 	int ret;
 
@@ -186,4 +186,3 @@ static int __init opal_hmi_handler_init(void)
 	}
 	return 0;
 }
-machine_subsys_initcall(powernv, opal_hmi_handler_init);
diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c
new file mode 100644
index 000000000000..e2e7d75f52f3
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-irqchip.c
@@ -0,0 +1,253 @@
+/*
+ * This file implements an irqchip for OPAL events. Whenever there is
+ * an interrupt that is handled by OPAL we get passed a list of events
+ * that Linux needs to do something about. These basically look like
+ * interrupts to Linux so we implement an irqchip to handle them.
+ *
+ * Copyright Alistair Popple, IBM Corporation 2014.
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+#include <linux/bitops.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqdomain.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/irq_work.h>
+
+#include <asm/machdep.h>
+#include <asm/opal.h>
+
+#include "powernv.h"
+
+/* Maximum number of events supported by OPAL firmware */
+#define MAX_NUM_EVENTS 64
+
+struct opal_event_irqchip {
+	struct irq_chip irqchip;
+	struct irq_domain *domain;
+	unsigned long mask;
+};
+static struct opal_event_irqchip opal_event_irqchip;
+
+static unsigned int opal_irq_count;
+static unsigned int *opal_irqs;
+
+static void opal_handle_irq_work(struct irq_work *work);
+static __be64 last_outstanding_events;
+static struct irq_work opal_event_irq_work = {
+	.func = opal_handle_irq_work,
+};
+
+static void opal_event_mask(struct irq_data *d)
+{
+	clear_bit(d->hwirq, &opal_event_irqchip.mask);
+}
+
+static void opal_event_unmask(struct irq_data *d)
+{
+	set_bit(d->hwirq, &opal_event_irqchip.mask);
+
+	opal_poll_events(&last_outstanding_events);
+	if (last_outstanding_events & opal_event_irqchip.mask)
+		/* Need to retrigger the interrupt */
+		irq_work_queue(&opal_event_irq_work);
+}
+
+static int opal_event_set_type(struct irq_data *d, unsigned int flow_type)
+{
+	/*
+	 * For now we only support level triggered events. The irq
+	 * handler will be called continuously until the event has
+	 * been cleared in OPAL.
+	 */
+	if (flow_type != IRQ_TYPE_LEVEL_HIGH)
+		return -EINVAL;
+
+	return 0;
+}
+
+static struct opal_event_irqchip opal_event_irqchip = {
+	.irqchip = {
+		.name = "OPAL EVT",
+		.irq_mask = opal_event_mask,
+		.irq_unmask = opal_event_unmask,
+		.irq_set_type = opal_event_set_type,
+	},
+	.mask = 0,
+};
+
+static int opal_event_map(struct irq_domain *d, unsigned int irq,
+			irq_hw_number_t hwirq)
+{
+	irq_set_chip_data(irq, &opal_event_irqchip);
+	irq_set_chip_and_handler(irq, &opal_event_irqchip.irqchip,
+				handle_level_irq);
+
+	return 0;
+}
+
+void opal_handle_events(uint64_t events)
+{
+	int virq, hwirq = 0;
+	u64 mask = opal_event_irqchip.mask;
+
+	if (!in_irq() && (events & mask)) {
+		last_outstanding_events = events;
+		irq_work_queue(&opal_event_irq_work);
+		return;
+	}
+
+	while (events & mask) {
+		hwirq = fls64(events) - 1;
+		if (BIT_ULL(hwirq) & mask) {
+			virq = irq_find_mapping(opal_event_irqchip.domain,
+						hwirq);
+			if (virq)
+				generic_handle_irq(virq);
+		}
+		events &= ~BIT_ULL(hwirq);
+	}
+}
+
+static irqreturn_t opal_interrupt(int irq, void *data)
+{
+	__be64 events;
+
+	opal_handle_interrupt(virq_to_hw(irq), &events);
+	opal_handle_events(be64_to_cpu(events));
+
+	return IRQ_HANDLED;
+}
+
+static void opal_handle_irq_work(struct irq_work *work)
+{
+	opal_handle_events(be64_to_cpu(last_outstanding_events));
+}
+
+static int opal_event_match(struct irq_domain *h, struct device_node *node)
+{
+	return h->of_node == node;
+}
+
+static int opal_event_xlate(struct irq_domain *h, struct device_node *np,
+			   const u32 *intspec, unsigned int intsize,
+			   irq_hw_number_t *out_hwirq, unsigned int *out_flags)
+{
+	*out_hwirq = intspec[0];
+	*out_flags = IRQ_TYPE_LEVEL_HIGH;
+
+	return 0;
+}
+
+static const struct irq_domain_ops opal_event_domain_ops = {
+	.match	= opal_event_match,
+	.map	= opal_event_map,
+	.xlate	= opal_event_xlate,
+};
+
+void opal_event_shutdown(void)
+{
+	unsigned int i;
+
+	/* First free interrupts, which will also mask them */
+	for (i = 0; i < opal_irq_count; i++) {
+		if (opal_irqs[i])
+			free_irq(opal_irqs[i], NULL);
+		opal_irqs[i] = 0;
+	}
+}
+
+int __init opal_event_init(void)
+{
+	struct device_node *dn, *opal_node;
+	const __be32 *irqs;
+	int i, irqlen, rc = 0;
+
+	opal_node = of_find_node_by_path("/ibm,opal");
+	if (!opal_node) {
+		pr_warn("opal: Node not found\n");
+		return -ENODEV;
+	}
+
+	/* If dn is NULL it means the domain won't be linked to a DT
+	 * node so therefore irq_of_parse_and_map(...) wont work. But
+	 * that shouldn't be problem because if we're running a
+	 * version of skiboot that doesn't have the dn then the
+	 * devices won't have the correct properties and will have to
+	 * fall back to the legacy method (opal_event_request(...))
+	 * anyway. */
+	dn = of_find_compatible_node(NULL, NULL, "ibm,opal-event");
+	opal_event_irqchip.domain = irq_domain_add_linear(dn, MAX_NUM_EVENTS,
+				&opal_event_domain_ops, &opal_event_irqchip);
+	of_node_put(dn);
+	if (!opal_event_irqchip.domain) {
+		pr_warn("opal: Unable to create irq domain\n");
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	/* Get interrupt property */
+	irqs = of_get_property(opal_node, "opal-interrupts", &irqlen);
+	opal_irq_count = irqs ? (irqlen / 4) : 0;
+	pr_debug("Found %d interrupts reserved for OPAL\n", opal_irq_count);
+
+	/* Install interrupt handlers */
+	opal_irqs = kcalloc(opal_irq_count, sizeof(*opal_irqs), GFP_KERNEL);
+	for (i = 0; irqs && i < opal_irq_count; i++, irqs++) {
+		unsigned int irq, virq;
+
+		/* Get hardware and virtual IRQ */
+		irq = be32_to_cpup(irqs);
+		virq = irq_create_mapping(NULL, irq);
+		if (virq == NO_IRQ) {
+			pr_warn("Failed to map irq 0x%x\n", irq);
+			continue;
+		}
+
+		/* Install interrupt handler */
+		rc = request_irq(virq, opal_interrupt, 0, "opal", NULL);
+		if (rc) {
+			irq_dispose_mapping(virq);
+			pr_warn("Error %d requesting irq %d (0x%x)\n",
+				 rc, virq, irq);
+			continue;
+		}
+
+		/* Cache IRQ */
+		opal_irqs[i] = virq;
+	}
+
+out:
+	of_node_put(opal_node);
+	return rc;
+}
+machine_arch_initcall(powernv, opal_event_init);
+
+/**
+ * opal_event_request(unsigned int opal_event_nr) - Request an event
+ * @opal_event_nr: the opal event number to request
+ *
+ * This routine can be used to find the linux virq number which can
+ * then be passed to request_irq to assign a handler for a particular
+ * opal event. This should only be used by legacy devices which don't
+ * have proper device tree bindings. Most devices should use
+ * irq_of_parse_and_map() instead.
+ */
+int opal_event_request(unsigned int opal_event_nr)
+{
+	if (WARN_ON_ONCE(!opal_event_irqchip.domain))
+		return NO_IRQ;
+
+	return irq_create_mapping(opal_event_irqchip.domain, opal_event_nr);
+}
+EXPORT_SYMBOL(opal_event_request);
diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c
index 43db2136dbff..00a29432be39 100644
--- a/arch/powerpc/platforms/powernv/opal-memory-errors.c
+++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c
@@ -144,4 +144,4 @@ static int __init opal_mem_err_init(void)
 	}
 	return 0;
 }
-machine_subsys_initcall(powernv, opal_mem_err_init);
+machine_device_initcall(powernv, opal_mem_err_init);
diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
new file mode 100644
index 000000000000..46cb3feb0a13
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -0,0 +1,449 @@
+/*
+ * OPAL Runtime Diagnostics interface driver
+ * Supported on POWERNV platform
+ *
+ * Copyright IBM Corporation 2015
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) "opal-prd: " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/poll.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <asm/opal-prd.h>
+#include <asm/opal.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+
+
+/**
+ * The msg member must be at the end of the struct, as it's followed by the
+ * message data.
+ */
+struct opal_prd_msg_queue_item {
+	struct list_head		list;
+	struct opal_prd_msg_header	msg;
+};
+
+static struct device_node *prd_node;
+static LIST_HEAD(opal_prd_msg_queue);
+static DEFINE_SPINLOCK(opal_prd_msg_queue_lock);
+static DECLARE_WAIT_QUEUE_HEAD(opal_prd_msg_wait);
+static atomic_t prd_usage;
+
+static bool opal_prd_range_is_valid(uint64_t addr, uint64_t size)
+{
+	struct device_node *parent, *node;
+	bool found;
+
+	if (addr + size < addr)
+		return false;
+
+	parent = of_find_node_by_path("/reserved-memory");
+	if (!parent)
+		return false;
+
+	found = false;
+
+	for_each_child_of_node(parent, node) {
+		uint64_t range_addr, range_size, range_end;
+		const __be32 *addrp;
+		const char *label;
+
+		addrp = of_get_address(node, 0, &range_size, NULL);
+
+		range_addr = of_read_number(addrp, 2);
+		range_end = range_addr + range_size;
+
+		label = of_get_property(node, "ibm,prd-label", NULL);
+
+		/* PRD ranges need a label */
+		if (!label)
+			continue;
+
+		if (range_end <= range_addr)
+			continue;
+
+		if (addr >= range_addr && addr + size <= range_end) {
+			found = true;
+			of_node_put(node);
+			break;
+		}
+	}
+
+	of_node_put(parent);
+	return found;
+}
+
+static int opal_prd_open(struct inode *inode, struct file *file)
+{
+	/*
+	 * Prevent multiple (separate) processes from concurrent interactions
+	 * with the FW PRD channel
+	 */
+	if (atomic_xchg(&prd_usage, 1) == 1)
+		return -EBUSY;
+
+	return 0;
+}
+
+/*
+ * opal_prd_mmap - maps firmware-provided ranges into userspace
+ * @file: file structure for the device
+ * @vma: VMA to map the registers into
+ */
+
+static int opal_prd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	size_t addr, size;
+	int rc;
+
+	pr_devel("opal_prd_mmap(0x%016lx, 0x%016lx, 0x%lx, 0x%lx)\n",
+			vma->vm_start, vma->vm_end, vma->vm_pgoff,
+			vma->vm_flags);
+
+	addr = vma->vm_pgoff << PAGE_SHIFT;
+	size = vma->vm_end - vma->vm_start;
+
+	/* ensure we're mapping within one of the allowable ranges */
+	if (!opal_prd_range_is_valid(addr, size))
+		return -EINVAL;
+
+	vma->vm_page_prot = __pgprot(pgprot_val(phys_mem_access_prot(file,
+						vma->vm_pgoff,
+						 size, vma->vm_page_prot))
+					| _PAGE_SPECIAL);
+
+	rc = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size,
+			vma->vm_page_prot);
+
+	return rc;
+}
+
+static bool opal_msg_queue_empty(void)
+{
+	unsigned long flags;
+	bool ret;
+
+	spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+	ret = list_empty(&opal_prd_msg_queue);
+	spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+
+	return ret;
+}
+
+static unsigned int opal_prd_poll(struct file *file,
+		struct poll_table_struct *wait)
+{
+	poll_wait(file, &opal_prd_msg_wait, wait);
+
+	if (!opal_msg_queue_empty())
+		return POLLIN | POLLRDNORM;
+
+	return 0;
+}
+
+static ssize_t opal_prd_read(struct file *file, char __user *buf,
+		size_t count, loff_t *ppos)
+{
+	struct opal_prd_msg_queue_item *item;
+	unsigned long flags;
+	ssize_t size, err;
+	int rc;
+
+	/* we need at least a header's worth of data */
+	if (count < sizeof(item->msg))
+		return -EINVAL;
+
+	if (*ppos)
+		return -ESPIPE;
+
+	item = NULL;
+
+	for (;;) {
+
+		spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+		if (!list_empty(&opal_prd_msg_queue)) {
+			item = list_first_entry(&opal_prd_msg_queue,
+					struct opal_prd_msg_queue_item, list);
+			list_del(&item->list);
+		}
+		spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+
+		if (item)
+			break;
+
+		if (file->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		rc = wait_event_interruptible(opal_prd_msg_wait,
+				!opal_msg_queue_empty());
+		if (rc)
+			return -EINTR;
+	}
+
+	size = be16_to_cpu(item->msg.size);
+	if (size > count) {
+		err = -EINVAL;
+		goto err_requeue;
+	}
+
+	rc = copy_to_user(buf, &item->msg, size);
+	if (rc) {
+		err = -EFAULT;
+		goto err_requeue;
+	}
+
+	kfree(item);
+
+	return size;
+
+err_requeue:
+	/* eep! re-queue at the head of the list */
+	spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+	list_add(&item->list, &opal_prd_msg_queue);
+	spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+	return err;
+}
+
+static ssize_t opal_prd_write(struct file *file, const char __user *buf,
+		size_t count, loff_t *ppos)
+{
+	struct opal_prd_msg_header hdr;
+	ssize_t size;
+	void *msg;
+	int rc;
+
+	size = sizeof(hdr);
+
+	if (count < size)
+		return -EINVAL;
+
+	/* grab the header */
+	rc = copy_from_user(&hdr, buf, sizeof(hdr));
+	if (rc)
+		return -EFAULT;
+
+	size = be16_to_cpu(hdr.size);
+
+	msg = kmalloc(size, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	rc = copy_from_user(msg, buf, size);
+	if (rc) {
+		size = -EFAULT;
+		goto out_free;
+	}
+
+	rc = opal_prd_msg(msg);
+	if (rc) {
+		pr_warn("write: opal_prd_msg returned %d\n", rc);
+		size = -EIO;
+	}
+
+out_free:
+	kfree(msg);
+
+	return size;
+}
+
+static int opal_prd_release(struct inode *inode, struct file *file)
+{
+	struct opal_prd_msg_header msg;
+
+	msg.size = cpu_to_be16(sizeof(msg));
+	msg.type = OPAL_PRD_MSG_TYPE_FINI;
+
+	opal_prd_msg((struct opal_prd_msg *)&msg);
+
+	atomic_xchg(&prd_usage, 0);
+
+	return 0;
+}
+
+static long opal_prd_ioctl(struct file *file, unsigned int cmd,
+		unsigned long param)
+{
+	struct opal_prd_info info;
+	struct opal_prd_scom scom;
+	int rc = 0;
+
+	switch (cmd) {
+	case OPAL_PRD_GET_INFO:
+		memset(&info, 0, sizeof(info));
+		info.version = OPAL_PRD_KERNEL_VERSION;
+		rc = copy_to_user((void __user *)param, &info, sizeof(info));
+		if (rc)
+			return -EFAULT;
+		break;
+
+	case OPAL_PRD_SCOM_READ:
+		rc = copy_from_user(&scom, (void __user *)param, sizeof(scom));
+		if (rc)
+			return -EFAULT;
+
+		scom.rc = opal_xscom_read(scom.chip, scom.addr,
+				(__be64 *)&scom.data);
+		scom.data = be64_to_cpu(scom.data);
+		pr_devel("ioctl SCOM_READ: chip %llx addr %016llx data %016llx rc %lld\n",
+				scom.chip, scom.addr, scom.data, scom.rc);
+
+		rc = copy_to_user((void __user *)param, &scom, sizeof(scom));
+		if (rc)
+			return -EFAULT;
+		break;
+
+	case OPAL_PRD_SCOM_WRITE:
+		rc = copy_from_user(&scom, (void __user *)param, sizeof(scom));
+		if (rc)
+			return -EFAULT;
+
+		scom.rc = opal_xscom_write(scom.chip, scom.addr, scom.data);
+		pr_devel("ioctl SCOM_WRITE: chip %llx addr %016llx data %016llx rc %lld\n",
+				scom.chip, scom.addr, scom.data, scom.rc);
+
+		rc = copy_to_user((void __user *)param, &scom, sizeof(scom));
+		if (rc)
+			return -EFAULT;
+		break;
+
+	default:
+		rc = -EINVAL;
+	}
+
+	return rc;
+}
+
+static const struct file_operations opal_prd_fops = {
+	.open		= opal_prd_open,
+	.mmap		= opal_prd_mmap,
+	.poll		= opal_prd_poll,
+	.read		= opal_prd_read,
+	.write		= opal_prd_write,
+	.unlocked_ioctl	= opal_prd_ioctl,
+	.release	= opal_prd_release,
+	.owner		= THIS_MODULE,
+};
+
+static struct miscdevice opal_prd_dev = {
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "opal-prd",
+	.fops		= &opal_prd_fops,
+};
+
+/* opal interface */
+static int opal_prd_msg_notifier(struct notifier_block *nb,
+		unsigned long msg_type, void *_msg)
+{
+	struct opal_prd_msg_queue_item *item;
+	struct opal_prd_msg_header *hdr;
+	struct opal_msg *msg = _msg;
+	int msg_size, item_size;
+	unsigned long flags;
+
+	if (msg_type != OPAL_MSG_PRD)
+		return 0;
+
+	/* Calculate total size of the message and item we need to store. The
+	 * 'size' field in the header includes the header itself. */
+	hdr = (void *)msg->params;
+	msg_size = be16_to_cpu(hdr->size);
+	item_size = msg_size + sizeof(*item) - sizeof(item->msg);
+
+	item = kzalloc(item_size, GFP_ATOMIC);
+	if (!item)
+		return -ENOMEM;
+
+	memcpy(&item->msg, msg->params, msg_size);
+
+	spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
+	list_add_tail(&item->list, &opal_prd_msg_queue);
+	spin_unlock_irqrestore(&opal_prd_msg_queue_lock, flags);
+
+	wake_up_interruptible(&opal_prd_msg_wait);
+
+	return 0;
+}
+
+static struct notifier_block opal_prd_event_nb = {
+	.notifier_call	= opal_prd_msg_notifier,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+static int opal_prd_probe(struct platform_device *pdev)
+{
+	int rc;
+
+	if (!pdev || !pdev->dev.of_node)
+		return -ENODEV;
+
+	/* We should only have one prd driver instance per machine; ensure
+	 * that we only get a valid probe on a single OF node.
+	 */
+	if (prd_node)
+		return -EBUSY;
+
+	prd_node = pdev->dev.of_node;
+
+	rc = opal_message_notifier_register(OPAL_MSG_PRD, &opal_prd_event_nb);
+	if (rc) {
+		pr_err("Couldn't register event notifier\n");
+		return rc;
+	}
+
+	rc = misc_register(&opal_prd_dev);
+	if (rc) {
+		pr_err("failed to register miscdev\n");
+		opal_message_notifier_unregister(OPAL_MSG_PRD,
+				&opal_prd_event_nb);
+		return rc;
+	}
+
+	return 0;
+}
+
+static int opal_prd_remove(struct platform_device *pdev)
+{
+	misc_deregister(&opal_prd_dev);
+	opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb);
+	return 0;
+}
+
+static const struct of_device_id opal_prd_match[] = {
+	{ .compatible = "ibm,opal-prd" },
+	{ },
+};
+
+static struct platform_driver opal_prd_driver = {
+	.driver = {
+		.name		= "opal-prd",
+		.owner		= THIS_MODULE,
+		.of_match_table	= opal_prd_match,
+	},
+	.probe	= opal_prd_probe,
+	.remove	= opal_prd_remove,
+};
+
+module_platform_driver(opal_prd_driver);
+
+MODULE_DEVICE_TABLE(of, opal_prd_match);
+MODULE_DESCRIPTION("PowerNV OPAL runtime diagnostic driver");
+MODULE_LICENSE("GPL");
diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c
index 655250499d18..a06059df9239 100644
--- a/arch/powerpc/platforms/powernv/opal-sensor.c
+++ b/arch/powerpc/platforms/powernv/opal-sensor.c
@@ -77,7 +77,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(opal_get_sensor_data);
 
-static __init int opal_sensor_init(void)
+int __init opal_sensor_init(void)
 {
 	struct platform_device *pdev;
 	struct device_node *sensor;
@@ -93,4 +93,3 @@ static __init int opal_sensor_init(void)
 
 	return PTR_ERR_OR_ZERO(pdev);
 }
-machine_subsys_initcall(powernv, opal_sensor_init);
diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c
index 9d1acf22a099..afe66c576a38 100644
--- a/arch/powerpc/platforms/powernv/opal-sysparam.c
+++ b/arch/powerpc/platforms/powernv/opal-sysparam.c
@@ -55,8 +55,10 @@ static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer)
 	}
 
 	ret = opal_get_param(token, param_id, (u64)buffer, length);
-	if (ret != OPAL_ASYNC_COMPLETION)
+	if (ret != OPAL_ASYNC_COMPLETION) {
+		ret = opal_error_code(ret);
 		goto out_token;
+	}
 
 	ret = opal_async_wait_response(token, &msg);
 	if (ret) {
@@ -65,7 +67,7 @@ static ssize_t opal_get_sys_param(u32 param_id, u32 length, void *buffer)
 		goto out_token;
 	}
 
-	ret = be64_to_cpu(msg.params[1]);
+	ret = opal_error_code(be64_to_cpu(msg.params[1]));
 
 out_token:
 	opal_async_release_token(token);
@@ -89,8 +91,10 @@ static int opal_set_sys_param(u32 param_id, u32 length, void *buffer)
 
 	ret = opal_set_param(token, param_id, (u64)buffer, length);
 
-	if (ret != OPAL_ASYNC_COMPLETION)
+	if (ret != OPAL_ASYNC_COMPLETION) {
+		ret = opal_error_code(ret);
 		goto out_token;
+	}
 
 	ret = opal_async_wait_response(token, &msg);
 	if (ret) {
@@ -99,7 +103,7 @@ static int opal_set_sys_param(u32 param_id, u32 length, void *buffer)
 		goto out_token;
 	}
 
-	ret = be64_to_cpu(msg.params[1]);
+	ret = opal_error_code(be64_to_cpu(msg.params[1]));
 
 out_token:
 	opal_async_release_token(token);
@@ -162,10 +166,20 @@ void __init opal_sys_param_init(void)
 		goto out;
 	}
 
+	/* Some systems do not use sysparams; this is not an error */
+	sysparam = of_find_node_by_path("/ibm,opal/sysparams");
+	if (!sysparam)
+		goto out;
+
+	if (!of_device_is_compatible(sysparam, "ibm,opal-sysparams")) {
+		pr_err("SYSPARAM: Opal sysparam node not compatible\n");
+		goto out_node_put;
+	}
+
 	sysparam_kobj = kobject_create_and_add("sysparams", opal_kobj);
 	if (!sysparam_kobj) {
 		pr_err("SYSPARAM: Failed to create sysparam kobject\n");
-		goto out;
+		goto out_node_put;
 	}
 
 	/* Allocate big enough buffer for any get/set transactions */
@@ -176,30 +190,19 @@ void __init opal_sys_param_init(void)
 		goto out_kobj_put;
 	}
 
-	sysparam = of_find_node_by_path("/ibm,opal/sysparams");
-	if (!sysparam) {
-		pr_err("SYSPARAM: Opal sysparam node not found\n");
-		goto out_param_buf;
-	}
-
-	if (!of_device_is_compatible(sysparam, "ibm,opal-sysparams")) {
-		pr_err("SYSPARAM: Opal sysparam node not compatible\n");
-		goto out_node_put;
-	}
-
 	/* Number of parameters exposed through DT */
 	count = of_property_count_strings(sysparam, "param-name");
 	if (count < 0) {
 		pr_err("SYSPARAM: No string found of property param-name in "
 				"the node %s\n", sysparam->name);
-		goto out_node_put;
+		goto out_param_buf;
 	}
 
 	id = kzalloc(sizeof(*id) * count, GFP_KERNEL);
 	if (!id) {
 		pr_err("SYSPARAM: Failed to allocate memory to read parameter "
 				"id\n");
-		goto out_node_put;
+		goto out_param_buf;
 	}
 
 	size = kzalloc(sizeof(*size) * count, GFP_KERNEL);
@@ -293,12 +296,12 @@ out_free_size:
 	kfree(size);
 out_free_id:
 	kfree(id);
-out_node_put:
-	of_node_put(sysparam);
 out_param_buf:
 	kfree(param_data_buf);
 out_kobj_put:
 	kobject_put(sysparam_kobj);
+out_node_put:
+	of_node_put(sysparam);
 out:
 	return;
 }
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index a7ade94cdf87..d6a7b8252e4d 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -283,6 +283,7 @@ OPAL_CALL(opal_sensor_read,			OPAL_SENSOR_READ);
 OPAL_CALL(opal_get_param,			OPAL_GET_PARAM);
 OPAL_CALL(opal_set_param,			OPAL_SET_PARAM);
 OPAL_CALL(opal_handle_hmi,			OPAL_HANDLE_HMI);
+OPAL_CALL(opal_config_cpu_idle_state,		OPAL_CONFIG_CPU_IDLE_STATE);
 OPAL_CALL(opal_slw_set_reg,			OPAL_SLW_SET_REG);
 OPAL_CALL(opal_register_dump_region,		OPAL_REGISTER_DUMP_REGION);
 OPAL_CALL(opal_unregister_dump_region,		OPAL_UNREGISTER_DUMP_REGION);
@@ -295,3 +296,4 @@ OPAL_CALL(opal_i2c_request,			OPAL_I2C_REQUEST);
 OPAL_CALL(opal_flash_read,			OPAL_FLASH_READ);
 OPAL_CALL(opal_flash_write,			OPAL_FLASH_WRITE);
 OPAL_CALL(opal_flash_erase,			OPAL_FLASH_ERASE);
+OPAL_CALL(opal_prd_msg,				OPAL_PRD_MSG);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 2241565b0739..f084afa0e3ba 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -53,13 +53,7 @@ static int mc_recoverable_range_len;
 
 struct device_node *opal_node;
 static DEFINE_SPINLOCK(opal_write_lock);
-static unsigned int *opal_irqs;
-static unsigned int opal_irq_count;
-static ATOMIC_NOTIFIER_HEAD(opal_notifier_head);
 static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
-static DEFINE_SPINLOCK(opal_notifier_lock);
-static uint64_t last_notified_mask = 0x0ul;
-static atomic_t opal_notifier_hold = ATOMIC_INIT(0);
 static uint32_t opal_heartbeat;
 
 static void opal_reinit_cores(void)
@@ -225,82 +219,6 @@ static int __init opal_register_exception_handlers(void)
 }
 machine_early_initcall(powernv, opal_register_exception_handlers);
 
-int opal_notifier_register(struct notifier_block *nb)
-{
-	if (!nb) {
-		pr_warning("%s: Invalid argument (%p)\n",
-			   __func__, nb);
-		return -EINVAL;
-	}
-
-	atomic_notifier_chain_register(&opal_notifier_head, nb);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(opal_notifier_register);
-
-int opal_notifier_unregister(struct notifier_block *nb)
-{
-	if (!nb) {
-		pr_warning("%s: Invalid argument (%p)\n",
-			   __func__, nb);
-		return -EINVAL;
-	}
-
-	atomic_notifier_chain_unregister(&opal_notifier_head, nb);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(opal_notifier_unregister);
-
-static void opal_do_notifier(uint64_t events)
-{
-	unsigned long flags;
-	uint64_t changed_mask;
-
-	if (atomic_read(&opal_notifier_hold))
-		return;
-
-	spin_lock_irqsave(&opal_notifier_lock, flags);
-	changed_mask = last_notified_mask ^ events;
-	last_notified_mask = events;
-	spin_unlock_irqrestore(&opal_notifier_lock, flags);
-
-	/*
-	 * We feed with the event bits and changed bits for
-	 * enough information to the callback.
-	 */
-	atomic_notifier_call_chain(&opal_notifier_head,
-				   events, (void *)changed_mask);
-}
-
-void opal_notifier_update_evt(uint64_t evt_mask,
-			      uint64_t evt_val)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&opal_notifier_lock, flags);
-	last_notified_mask &= ~evt_mask;
-	last_notified_mask |= evt_val;
-	spin_unlock_irqrestore(&opal_notifier_lock, flags);
-}
-
-void opal_notifier_enable(void)
-{
-	int64_t rc;
-	__be64 evt = 0;
-
-	atomic_set(&opal_notifier_hold, 0);
-
-	/* Process pending events */
-	rc = opal_poll_events(&evt);
-	if (rc == OPAL_SUCCESS && evt)
-		opal_do_notifier(be64_to_cpu(evt));
-}
-
-void opal_notifier_disable(void)
-{
-	atomic_set(&opal_notifier_hold, 1);
-}
-
 /*
  * Opal message notifier based on message type. Allow subscribers to get
  * notified for specific messgae type.
@@ -317,6 +235,7 @@ int opal_message_notifier_register(enum opal_msg_type msg_type,
 	return atomic_notifier_chain_register(
 				&opal_msg_notifier_head[msg_type], nb);
 }
+EXPORT_SYMBOL_GPL(opal_message_notifier_register);
 
 int opal_message_notifier_unregister(enum opal_msg_type msg_type,
 				     struct notifier_block *nb)
@@ -324,6 +243,7 @@ int opal_message_notifier_unregister(enum opal_msg_type msg_type,
 	return atomic_notifier_chain_unregister(
 			&opal_msg_notifier_head[msg_type], nb);
 }
+EXPORT_SYMBOL_GPL(opal_message_notifier_unregister);
 
 static void opal_message_do_notify(uint32_t msg_type, void *msg)
 {
@@ -364,36 +284,36 @@ static void opal_handle_message(void)
 	opal_message_do_notify(type, (void *)&msg);
 }
 
-static int opal_message_notify(struct notifier_block *nb,
-			  unsigned long events, void *change)
+static irqreturn_t opal_message_notify(int irq, void *data)
 {
-	if (events & OPAL_EVENT_MSG_PENDING)
-		opal_handle_message();
-	return 0;
+	opal_handle_message();
+	return IRQ_HANDLED;
 }
 
-static struct notifier_block opal_message_nb = {
-	.notifier_call	= opal_message_notify,
-	.next		= NULL,
-	.priority	= 0,
-};
-
 static int __init opal_message_init(void)
 {
-	int ret, i;
+	int ret, i, irq;
 
 	for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
 		ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
 
-	ret = opal_notifier_register(&opal_message_nb);
+	irq = opal_event_request(ilog2(OPAL_EVENT_MSG_PENDING));
+	if (!irq) {
+		pr_err("%s: Can't register OPAL event irq (%d)\n",
+		       __func__, irq);
+		return irq;
+	}
+
+	ret = request_irq(irq, opal_message_notify,
+			IRQ_TYPE_LEVEL_HIGH, "opal-msg", NULL);
 	if (ret) {
-		pr_err("%s: Can't register OPAL event notifier (%d)\n",
+		pr_err("%s: Can't request OPAL event irq (%d)\n",
 		       __func__, ret);
 		return ret;
 	}
+
 	return 0;
 }
-machine_early_initcall(powernv, opal_message_init);
 
 int opal_get_chars(uint32_t vtermno, char *buf, int count)
 {
@@ -573,7 +493,7 @@ int opal_handle_hmi_exception(struct pt_regs *regs)
 	local_paca->hmi_event_available = 0;
 	rc = opal_poll_events(&evt);
 	if (rc == OPAL_SUCCESS && evt)
-		opal_do_notifier(be64_to_cpu(evt));
+		opal_handle_events(be64_to_cpu(evt));
 
 	return 1;
 }
@@ -610,17 +530,6 @@ out:
 	return !!recover_addr;
 }
 
-static irqreturn_t opal_interrupt(int irq, void *data)
-{
-	__be64 events;
-
-	opal_handle_interrupt(virq_to_hw(irq), &events);
-
-	opal_do_notifier(be64_to_cpu(events));
-
-	return IRQ_HANDLED;
-}
-
 static int opal_sysfs_init(void)
 {
 	opal_kobj = kobject_create_and_add("opal", firmware_kobj);
@@ -693,21 +602,13 @@ static void __init opal_dump_region_init(void)
 			"rc = %d\n", rc);
 }
 
-static void opal_flash_init(struct device_node *opal_node)
-{
-	struct device_node *np;
-
-	for_each_child_of_node(opal_node, np)
-		if (of_device_is_compatible(np, "ibm,opal-flash"))
-			of_platform_device_create(np, NULL, NULL);
-}
-
-static void opal_ipmi_init(struct device_node *opal_node)
+static void opal_pdev_init(struct device_node *opal_node,
+		const char *compatible)
 {
 	struct device_node *np;
 
 	for_each_child_of_node(opal_node, np)
-		if (of_device_is_compatible(np, "ibm,opal-ipmi"))
+		if (of_device_is_compatible(np, compatible))
 			of_platform_device_create(np, NULL, NULL);
 }
 
@@ -719,52 +620,15 @@ static void opal_i2c_create_devs(void)
 		of_platform_device_create(np, NULL, NULL);
 }
 
-static void __init opal_irq_init(struct device_node *dn)
-{
-	const __be32 *irqs;
-	int i, irqlen;
-
-	/* Get interrupt property */
-	irqs = of_get_property(opal_node, "opal-interrupts", &irqlen);
-	opal_irq_count = irqs ? (irqlen / 4) : 0;
-	pr_debug("Found %d interrupts reserved for OPAL\n", opal_irq_count);
-	if (!opal_irq_count)
-		return;
-
-	/* Install interrupt handlers */
-	opal_irqs = kzalloc(opal_irq_count * sizeof(unsigned int), GFP_KERNEL);
-	for (i = 0; irqs && i < opal_irq_count; i++, irqs++) {
-		unsigned int irq, virq;
-		int rc;
-
-		/* Get hardware and virtual IRQ */
-		irq = be32_to_cpup(irqs);
-		virq = irq_create_mapping(NULL, irq);
-		if (virq == NO_IRQ) {
-			pr_warn("Failed to map irq 0x%x\n", irq);
-			continue;
-		}
-
-		/* Install interrupt handler */
-		rc = request_irq(virq, opal_interrupt, 0, "opal", NULL);
-		if (rc) {
-			irq_dispose_mapping(virq);
-			pr_warn("Error %d requesting irq %d (0x%x)\n",
-				 rc, virq, irq);
-			continue;
-		}
-
-		/* Cache IRQ */
-		opal_irqs[i] = virq;
-	}
-}
-
 static int kopald(void *unused)
 {
+	__be64 events;
+
 	set_freezable();
 	do {
 		try_to_freeze();
-		opal_poll_events(NULL);
+		opal_poll_events(&events);
+		opal_handle_events(be64_to_cpu(events));
 		msleep_interruptible(opal_heartbeat);
 	} while (!kthread_should_stop());
 
@@ -807,15 +671,24 @@ static int __init opal_init(void)
 		of_node_put(consoles);
 	}
 
+	/* Initialise OPAL messaging system */
+	opal_message_init();
+
+	/* Initialise OPAL asynchronous completion interface */
+	opal_async_comp_init();
+
+	/* Initialise OPAL sensor interface */
+	opal_sensor_init();
+
+	/* Initialise OPAL hypervisor maintainence interrupt handling */
+	opal_hmi_handler_init();
+
 	/* Create i2c platform devices */
 	opal_i2c_create_devs();
 
 	/* Setup a heatbeat thread if requested by OPAL */
 	opal_init_heartbeat();
 
-	/* Find all OPAL interrupts and request them */
-	opal_irq_init(opal_node);
-
 	/* Create "opal" kobject under /sys/firmware */
 	rc = opal_sysfs_init();
 	if (rc == 0) {
@@ -835,10 +708,10 @@ static int __init opal_init(void)
 		opal_msglog_init();
 	}
 
-	/* Initialize OPAL IPMI backend */
-	opal_ipmi_init(opal_node);
-
-	opal_flash_init(opal_node);
+	/* Initialize platform devices: IPMI backend, PRD & flash interface */
+	opal_pdev_init(opal_node, "ibm,opal-ipmi");
+	opal_pdev_init(opal_node, "ibm,opal-flash");
+	opal_pdev_init(opal_node, "ibm,opal-prd");
 
 	return 0;
 }
@@ -846,15 +719,9 @@ machine_subsys_initcall(powernv, opal_init);
 
 void opal_shutdown(void)
 {
-	unsigned int i;
 	long rc = OPAL_BUSY;
 
-	/* First free interrupts, which will also mask them */
-	for (i = 0; i < opal_irq_count; i++) {
-		if (opal_irqs[i])
-			free_irq(opal_irqs[i], NULL);
-		opal_irqs[i] = 0;
-	}
+	opal_event_shutdown();
 
 	/*
 	 * Then sync with OPAL which ensure anything that can
@@ -876,11 +743,14 @@ void opal_shutdown(void)
 
 /* Export this so that test modules can use it */
 EXPORT_SYMBOL_GPL(opal_invalid_call);
+EXPORT_SYMBOL_GPL(opal_xscom_read);
+EXPORT_SYMBOL_GPL(opal_xscom_write);
 EXPORT_SYMBOL_GPL(opal_ipmi_send);
 EXPORT_SYMBOL_GPL(opal_ipmi_recv);
 EXPORT_SYMBOL_GPL(opal_flash_read);
 EXPORT_SYMBOL_GPL(opal_flash_write);
 EXPORT_SYMBOL_GPL(opal_flash_erase);
+EXPORT_SYMBOL_GPL(opal_prd_msg);
 
 /* Convert a region of vmalloc memory to an opal sg list */
 struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
@@ -954,6 +824,7 @@ int opal_error_code(int rc)
 	case OPAL_ASYNC_COMPLETION:	return -EINPROGRESS;
 	case OPAL_BUSY_EVENT:		return -EBUSY;
 	case OPAL_NO_MEM:		return -ENOMEM;
+	case OPAL_PERMISSION:		return -EPERM;
 
 	case OPAL_UNSUPPORTED:		return -EIO;
 	case OPAL_HARDWARE:		return -EIO;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index f8bc950efcae..5738d315248b 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -23,6 +23,9 @@
 #include <linux/io.h>
 #include <linux/msi.h>
 #include <linux/memblock.h>
+#include <linux/iommu.h>
+#include <linux/rculist.h>
+#include <linux/sizes.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -38,8 +41,9 @@
 #include <asm/debug.h>
 #include <asm/firmware.h>
 #include <asm/pnv-pci.h>
+#include <asm/mmzone.h>
 
-#include <misc/cxl.h>
+#include <misc/cxl-base.h>
 
 #include "powernv.h"
 #include "pci.h"
@@ -47,6 +51,11 @@
 /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
 #define TCE32_TABLE_SIZE	((0x10000000 / 0x1000) * 8)
 
+#define POWERNV_IOMMU_DEFAULT_LEVELS	1
+#define POWERNV_IOMMU_MAX_LEVELS	5
+
+static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
+
 static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 			    const char *fmt, ...)
 {
@@ -1086,10 +1095,6 @@ static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
 		return;
 	}
 
-	pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
-			GFP_KERNEL, hose->node);
-	pe->tce32_table->data = pe;
-
 	/* Associate it with all child devices */
 	pnv_ioda_setup_same_PE(bus, pe);
 
@@ -1283,36 +1288,27 @@ m64_failed:
 	return -EBUSY;
 }
 
+static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
+		int num);
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+
 static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
 {
-	struct pci_bus        *bus;
-	struct pci_controller *hose;
-	struct pnv_phb        *phb;
 	struct iommu_table    *tbl;
-	unsigned long         addr;
 	int64_t               rc;
 
-	bus = dev->bus;
-	hose = pci_bus_to_host(bus);
-	phb = hose->private_data;
-	tbl = pe->tce32_table;
-	addr = tbl->it_base;
-
-	opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-				   pe->pe_number << 1, 1, __pa(addr),
-				   0, 0x1000);
-
-	rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
-				        pe->pe_number,
-				        (pe->pe_number << 1) + 1,
-				        pe->tce_bypass_base,
-				        0);
+	tbl = pe->table_group.tables[0];
+	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
 	if (rc)
 		pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
 
+	pnv_pci_ioda2_set_bypass(pe, false);
+	if (pe->table_group.group) {
+		iommu_group_put(pe->table_group.group);
+		BUG_ON(pe->table_group.group);
+	}
+	pnv_pci_ioda2_table_free_pages(tbl);
 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
-	free_pages(addr, get_order(TCE32_TABLE_SIZE));
-	pe->tce32_table = NULL;
 }
 
 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
@@ -1460,10 +1456,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 			continue;
 		}
 
-		pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
-				GFP_KERNEL, hose->node);
-		pe->tce32_table->data = pe;
-
 		/* Put PE to the list */
 		mutex_lock(&phb->ioda.pe_list_mutex);
 		list_add_tail(&pe->list, &phb->ioda.pe_list);
@@ -1598,12 +1590,19 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
 
 	pe = &phb->ioda.pe_array[pdn->pe_number];
 	WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
-	set_iommu_table_base_and_group(&pdev->dev, pe->tce32_table);
+	set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
+	/*
+	 * Note: iommu_add_device() will fail here as
+	 * for physical PE: the device is already added by now;
+	 * for virtual PE: sysfs entries are not ready yet and
+	 * tce_iommu_bus_notifier will add the device to a group later.
+	 */
 }
 
-static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
-				     struct pci_dev *pdev, u64 dma_mask)
+static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
 {
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
 	struct pci_dn *pdn = pci_get_pdn(pdev);
 	struct pnv_ioda_pe *pe;
 	uint64_t top;
@@ -1625,7 +1624,7 @@ static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
 	} else {
 		dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
 		set_dma_ops(&pdev->dev, &dma_iommu_ops);
-		set_iommu_table_base(&pdev->dev, pe->tce32_table);
+		set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
 	}
 	*pdev->dev.dma_mask = dma_mask;
 	return 0;
@@ -1654,36 +1653,36 @@ static u64 pnv_pci_ioda_dma_get_required_mask(struct pnv_phb *phb,
 }
 
 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
-				   struct pci_bus *bus,
-				   bool add_to_iommu_group)
+				   struct pci_bus *bus)
 {
 	struct pci_dev *dev;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
-		if (add_to_iommu_group)
-			set_iommu_table_base_and_group(&dev->dev,
-						       pe->tce32_table);
-		else
-			set_iommu_table_base(&dev->dev, pe->tce32_table);
+		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
+		iommu_add_device(&dev->dev);
 
-		if (dev->subordinate)
-			pnv_ioda_setup_bus_dma(pe, dev->subordinate,
-					       add_to_iommu_group);
+		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
+			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
 	}
 }
 
-static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
-					 struct iommu_table *tbl,
-					 __be64 *startp, __be64 *endp, bool rm)
+static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
+		unsigned long index, unsigned long npages, bool rm)
 {
+	struct iommu_table_group_link *tgl = list_first_entry_or_null(
+			&tbl->it_group_list, struct iommu_table_group_link,
+			next);
+	struct pnv_ioda_pe *pe = container_of(tgl->table_group,
+			struct pnv_ioda_pe, table_group);
 	__be64 __iomem *invalidate = rm ?
-		(__be64 __iomem *)pe->tce_inval_reg_phys :
-		(__be64 __iomem *)tbl->it_index;
+		(__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
+		pe->phb->ioda.tce_inval_reg;
 	unsigned long start, end, inc;
 	const unsigned shift = tbl->it_page_shift;
 
-	start = __pa(startp);
-	end = __pa(endp);
+	start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
+	end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
+			npages - 1);
 
 	/* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
 	if (tbl->it_busno) {
@@ -1719,26 +1718,79 @@ static void pnv_pci_ioda1_tce_invalidate(struct pnv_ioda_pe *pe,
 	 */
 }
 
-static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
-					 struct iommu_table *tbl,
-					 __be64 *startp, __be64 *endp, bool rm)
+static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
+		long npages, unsigned long uaddr,
+		enum dma_data_direction direction,
+		struct dma_attrs *attrs)
+{
+	int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
+			attrs);
+
+	if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
+		pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+
+	return ret;
+}
+
+#ifdef CONFIG_IOMMU_API
+static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+
+	if (!ret && (tbl->it_type &
+			(TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE)))
+		pnv_pci_ioda1_tce_invalidate(tbl, index, 1, false);
+
+	return ret;
+}
+#endif
+
+static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
+		long npages)
+{
+	pnv_tce_free(tbl, index, npages);
+
+	if (tbl->it_type & TCE_PCI_SWINV_FREE)
+		pnv_pci_ioda1_tce_invalidate(tbl, index, npages, false);
+}
+
+static struct iommu_table_ops pnv_ioda1_iommu_ops = {
+	.set = pnv_ioda1_tce_build,
+#ifdef CONFIG_IOMMU_API
+	.exchange = pnv_ioda1_tce_xchg,
+#endif
+	.clear = pnv_ioda1_tce_free,
+	.get = pnv_tce_get,
+};
+
+static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe)
+{
+	/* 01xb - invalidate TCEs that match the specified PE# */
+	unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF);
+	struct pnv_phb *phb = pe->phb;
+
+	if (!phb->ioda.tce_inval_reg)
+		return;
+
+	mb(); /* Ensure above stores are visible */
+	__raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg);
+}
+
+static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm,
+		__be64 __iomem *invalidate, unsigned shift,
+		unsigned long index, unsigned long npages)
 {
 	unsigned long start, end, inc;
-	__be64 __iomem *invalidate = rm ?
-		(__be64 __iomem *)pe->tce_inval_reg_phys :
-		(__be64 __iomem *)tbl->it_index;
-	const unsigned shift = tbl->it_page_shift;
 
 	/* We'll invalidate DMA address in PE scope */
 	start = 0x2ull << 60;
-	start |= (pe->pe_number & 0xFF);
+	start |= (pe_number & 0xFF);
 	end = start;
 
 	/* Figure out the start, end and step */
-	inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64));
-	start |= (inc << shift);
-	inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64));
-	end |= (inc << shift);
+	start |= (index << shift);
+	end |= ((index + npages - 1) << shift);
 	inc = (0x1ull << shift);
 	mb();
 
@@ -1751,25 +1803,83 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
 	}
 }
 
-void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
-				 __be64 *startp, __be64 *endp, bool rm)
+static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
+		unsigned long index, unsigned long npages, bool rm)
 {
-	struct pnv_ioda_pe *pe = tbl->data;
-	struct pnv_phb *phb = pe->phb;
+	struct iommu_table_group_link *tgl;
 
-	if (phb->type == PNV_PHB_IODA1)
-		pnv_pci_ioda1_tce_invalidate(pe, tbl, startp, endp, rm);
-	else
-		pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp, rm);
+	list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
+		struct pnv_ioda_pe *pe = container_of(tgl->table_group,
+				struct pnv_ioda_pe, table_group);
+		__be64 __iomem *invalidate = rm ?
+			(__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys :
+			pe->phb->ioda.tce_inval_reg;
+
+		pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm,
+			invalidate, tbl->it_page_shift,
+			index, npages);
+	}
+}
+
+static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
+		long npages, unsigned long uaddr,
+		enum dma_data_direction direction,
+		struct dma_attrs *attrs)
+{
+	int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
+			attrs);
+
+	if (!ret && (tbl->it_type & TCE_PCI_SWINV_CREATE))
+		pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
+
+	return ret;
+}
+
+#ifdef CONFIG_IOMMU_API
+static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+
+	if (!ret && (tbl->it_type &
+			(TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE)))
+		pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
+
+	return ret;
+}
+#endif
+
+static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
+		long npages)
+{
+	pnv_tce_free(tbl, index, npages);
+
+	if (tbl->it_type & TCE_PCI_SWINV_FREE)
+		pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
+}
+
+static void pnv_ioda2_table_free(struct iommu_table *tbl)
+{
+	pnv_pci_ioda2_table_free_pages(tbl);
+	iommu_free_table(tbl, "pnv");
 }
 
+static struct iommu_table_ops pnv_ioda2_iommu_ops = {
+	.set = pnv_ioda2_tce_build,
+#ifdef CONFIG_IOMMU_API
+	.exchange = pnv_ioda2_tce_xchg,
+#endif
+	.clear = pnv_ioda2_tce_free,
+	.get = pnv_tce_get,
+	.free = pnv_ioda2_table_free,
+};
+
 static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 				      struct pnv_ioda_pe *pe, unsigned int base,
 				      unsigned int segs)
 {
 
 	struct page *tce_mem = NULL;
-	const __be64 *swinvp;
 	struct iommu_table *tbl;
 	unsigned int i;
 	int64_t rc;
@@ -1783,6 +1893,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	if (WARN_ON(pe->tce32_seg >= 0))
 		return;
 
+	tbl = pnv_pci_table_alloc(phb->hose->node);
+	iommu_register_group(&pe->table_group, phb->hose->global_number,
+			pe->pe_number);
+	pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
+
 	/* Grab a 32-bit TCE table */
 	pe->tce32_seg = base;
 	pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
@@ -1817,39 +1932,30 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	}
 
 	/* Setup linux iommu table */
-	tbl = pe->tce32_table;
 	pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs,
 				  base << 28, IOMMU_PAGE_SHIFT_4K);
 
 	/* OPAL variant of P7IOC SW invalidated TCEs */
-	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
-	if (swinvp) {
-		/* We need a couple more fields -- an address and a data
-		 * to or.  Since the bus is only printed out on table free
-		 * errors, and on the first pass the data will be a relative
-		 * bus number, print that out instead.
-		 */
-		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
-		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
-				8);
+	if (phb->ioda.tce_inval_reg)
 		tbl->it_type |= (TCE_PCI_SWINV_CREATE |
 				 TCE_PCI_SWINV_FREE   |
 				 TCE_PCI_SWINV_PAIR);
-	}
+
+	tbl->it_ops = &pnv_ioda1_iommu_ops;
+	pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
+	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
 	iommu_init_table(tbl, phb->hose->node);
 
 	if (pe->flags & PNV_IODA_PE_DEV) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
-		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
-	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
-		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
-	} else if (pe->flags & PNV_IODA_PE_VF) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
-	}
+		/*
+		 * Setting table base here only for carrying iommu_group
+		 * further down to let iommu_add_device() do the job.
+		 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
+		 */
+		set_iommu_table_base(&pe->pdev->dev, tbl);
+		iommu_add_device(&pe->pdev->dev);
+	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
 
 	return;
  fail:
@@ -1858,11 +1964,53 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 		pe->tce32_seg = -1;
 	if (tce_mem)
 		__free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
+	if (tbl) {
+		pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
+		iommu_free_table(tbl, "pnv");
+	}
 }
 
-static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
+static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
+		int num, struct iommu_table *tbl)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	struct pnv_phb *phb = pe->phb;
+	int64_t rc;
+	const unsigned long size = tbl->it_indirect_levels ?
+			tbl->it_level_size : tbl->it_size;
+	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
+	const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+
+	pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num,
+			start_addr, start_addr + win_size - 1,
+			IOMMU_PAGE_SIZE(tbl));
+
+	/*
+	 * Map TCE table through TVT. The TVE index is the PE number
+	 * shifted by 1 bit for 32-bits DMA space.
+	 */
+	rc = opal_pci_map_pe_dma_window(phb->opal_id,
+			pe->pe_number,
+			(pe->pe_number << 1) + num,
+			tbl->it_indirect_levels + 1,
+			__pa(tbl->it_base),
+			size << 3,
+			IOMMU_PAGE_SIZE(tbl));
+	if (rc) {
+		pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
+		return rc;
+	}
+
+	pnv_pci_link_table_and_group(phb->hose->node, num,
+			tbl, &pe->table_group);
+	pnv_pci_ioda2_tce_invalidate_entire(pe);
+
+	return 0;
+}
+
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
 {
-	struct pnv_ioda_pe *pe = tbl->data;
 	uint16_t window_id = (pe->pe_number << 1 ) + 1;
 	int64_t rc;
 
@@ -1882,17 +2030,6 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
 						     window_id,
 						     pe->tce_bypass_base,
 						     0);
-
-		/*
-		 * EEH needs the mapping between IOMMU table and group
-		 * of those VFIO/KVM pass-through devices. We can postpone
-		 * resetting DMA ops until the DMA mask is configured in
-		 * host side.
-		 */
-		if (pe->pdev)
-			set_iommu_table_base(&pe->pdev->dev, tbl);
-		else
-			pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
 	}
 	if (rc)
 		pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
@@ -1900,106 +2037,363 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
 		pe->tce_bypass_enabled = enable;
 }
 
-static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
-					  struct pnv_ioda_pe *pe)
+static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
+		__u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table *tbl);
+
+static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
+		int num, __u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table **ptbl)
 {
-	/* TVE #1 is selected by PCI address bit 59 */
-	pe->tce_bypass_base = 1ull << 59;
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	int nid = pe->phb->hose->node;
+	__u64 bus_offset = num ? pe->tce_bypass_base : table_group->tce32_start;
+	long ret;
+	struct iommu_table *tbl;
 
-	/* Install set_bypass callback for VFIO */
-	pe->tce32_table->set_bypass = pnv_pci_ioda2_set_bypass;
+	tbl = pnv_pci_table_alloc(nid);
+	if (!tbl)
+		return -ENOMEM;
 
-	/* Enable bypass by default */
-	pnv_pci_ioda2_set_bypass(pe->tce32_table, true);
+	ret = pnv_pci_ioda2_table_alloc_pages(nid,
+			bus_offset, page_shift, window_size,
+			levels, tbl);
+	if (ret) {
+		iommu_free_table(tbl, "pnv");
+		return ret;
+	}
+
+	tbl->it_ops = &pnv_ioda2_iommu_ops;
+	if (pe->phb->ioda.tce_inval_reg)
+		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+
+	*ptbl = tbl;
+
+	return 0;
 }
 
-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
-				       struct pnv_ioda_pe *pe)
+static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
+{
+	struct iommu_table *tbl = NULL;
+	long rc;
+
+	rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
+			IOMMU_PAGE_SHIFT_4K,
+			pe->table_group.tce32_size,
+			POWERNV_IOMMU_DEFAULT_LEVELS, &tbl);
+	if (rc) {
+		pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
+				rc);
+		return rc;
+	}
+
+	iommu_init_table(tbl, pe->phb->hose->node);
+
+	rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+	if (rc) {
+		pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
+				rc);
+		pnv_ioda2_table_free(tbl);
+		return rc;
+	}
+
+	if (!pnv_iommu_bypass_disabled)
+		pnv_pci_ioda2_set_bypass(pe, true);
+
+	/* OPAL variant of PHB3 invalidated TCEs */
+	if (pe->phb->ioda.tce_inval_reg)
+		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+
+	/*
+	 * Setting table base here only for carrying iommu_group
+	 * further down to let iommu_add_device() do the job.
+	 * pnv_pci_ioda_dma_dev_setup will override it later anyway.
+	 */
+	if (pe->flags & PNV_IODA_PE_DEV)
+		set_iommu_table_base(&pe->pdev->dev, tbl);
+
+	return 0;
+}
+
+#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV)
+static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
+		int num)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+			table_group);
+	struct pnv_phb *phb = pe->phb;
+	long ret;
+
+	pe_info(pe, "Removing DMA window #%d\n", num);
+
+	ret = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+			(pe->pe_number << 1) + num,
+			0/* levels */, 0/* table address */,
+			0/* table size */, 0/* page size */);
+	if (ret)
+		pe_warn(pe, "Unmapping failed, ret = %ld\n", ret);
+	else
+		pnv_pci_ioda2_tce_invalidate_entire(pe);
+
+	pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
+
+	return ret;
+}
+#endif
+
+#ifdef CONFIG_IOMMU_API
+static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
+		__u64 window_size, __u32 levels)
+{
+	unsigned long bytes = 0;
+	const unsigned window_shift = ilog2(window_size);
+	unsigned entries_shift = window_shift - page_shift;
+	unsigned table_shift = entries_shift + 3;
+	unsigned long tce_table_size = max(0x1000UL, 1UL << table_shift);
+	unsigned long direct_table_size;
+
+	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
+			(window_size > memory_hotplug_max()) ||
+			!is_power_of_2(window_size))
+		return 0;
+
+	/* Calculate a direct table size from window_size and levels */
+	entries_shift = (entries_shift + levels - 1) / levels;
+	table_shift = entries_shift + 3;
+	table_shift = max_t(unsigned, table_shift, PAGE_SHIFT);
+	direct_table_size =  1UL << table_shift;
+
+	for ( ; levels; --levels) {
+		bytes += _ALIGN_UP(tce_table_size, direct_table_size);
+
+		tce_table_size /= direct_table_size;
+		tce_table_size <<= 3;
+		tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size);
+	}
+
+	return bytes;
+}
+
+static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+						table_group);
+	/* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
+	struct iommu_table *tbl = pe->table_group.tables[0];
+
+	pnv_pci_ioda2_set_bypass(pe, false);
+	pnv_pci_ioda2_unset_window(&pe->table_group, 0);
+	pnv_ioda2_table_free(tbl);
+}
+
+static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
+{
+	struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
+						table_group);
+
+	pnv_pci_ioda2_setup_default_config(pe);
+}
+
+static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
+	.get_table_size = pnv_pci_ioda2_get_table_size,
+	.create_table = pnv_pci_ioda2_create_table,
+	.set_window = pnv_pci_ioda2_set_window,
+	.unset_window = pnv_pci_ioda2_unset_window,
+	.take_ownership = pnv_ioda2_take_ownership,
+	.release_ownership = pnv_ioda2_release_ownership,
+};
+#endif
+
+static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb)
 {
-	struct page *tce_mem = NULL;
-	void *addr;
 	const __be64 *swinvp;
-	struct iommu_table *tbl;
-	unsigned int tce_table_size, end;
-	int64_t rc;
 
-	/* We shouldn't already have a 32-bit DMA associated */
-	if (WARN_ON(pe->tce32_seg >= 0))
+	/* OPAL variant of PHB3 invalidated TCEs */
+	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
+	if (!swinvp)
 		return;
 
-	/* The PE will reserve all possible 32-bits space */
-	pe->tce32_seg = 0;
-	end = (1 << ilog2(phb->ioda.m32_pci_base));
-	tce_table_size = (end / 0x1000) * 8;
-	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
-		end);
+	phb->ioda.tce_inval_reg_phys = be64_to_cpup(swinvp);
+	phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8);
+}
 
-	/* Allocate TCE table */
-	tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
-				   get_order(tce_table_size));
+static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift,
+		unsigned levels, unsigned long limit,
+		unsigned long *current_offset)
+{
+	struct page *tce_mem = NULL;
+	__be64 *addr, *tmp;
+	unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
+	unsigned long allocated = 1UL << (order + PAGE_SHIFT);
+	unsigned entries = 1UL << (shift - 3);
+	long i;
+
+	tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
 	if (!tce_mem) {
-		pe_err(pe, "Failed to allocate a 32-bit TCE memory\n");
-		goto fail;
+		pr_err("Failed to allocate a TCE memory, order=%d\n", order);
+		return NULL;
 	}
 	addr = page_address(tce_mem);
-	memset(addr, 0, tce_table_size);
+	memset(addr, 0, allocated);
+
+	--levels;
+	if (!levels) {
+		*current_offset += allocated;
+		return addr;
+	}
+
+	for (i = 0; i < entries; ++i) {
+		tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift,
+				levels, limit, current_offset);
+		if (!tmp)
+			break;
+
+		addr[i] = cpu_to_be64(__pa(tmp) |
+				TCE_PCI_READ | TCE_PCI_WRITE);
+
+		if (*current_offset >= limit)
+			break;
+	}
+
+	return addr;
+}
+
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+		unsigned long size, unsigned level);
+
+static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
+		__u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table *tbl)
+{
+	void *addr;
+	unsigned long offset = 0, level_shift;
+	const unsigned window_shift = ilog2(window_size);
+	unsigned entries_shift = window_shift - page_shift;
+	unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT);
+	const unsigned long tce_table_size = 1UL << table_shift;
+
+	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
+		return -EINVAL;
+
+	if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
+		return -EINVAL;
+
+	/* Adjust direct table size from window_size and levels */
+	entries_shift = (entries_shift + levels - 1) / levels;
+	level_shift = entries_shift + 3;
+	level_shift = max_t(unsigned, level_shift, PAGE_SHIFT);
+
+	/* Allocate TCE table */
+	addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
+			levels, tce_table_size, &offset);
+
+	/* addr==NULL means that the first level allocation failed */
+	if (!addr)
+		return -ENOMEM;
 
 	/*
-	 * Map TCE table through TVT. The TVE index is the PE number
-	 * shifted by 1 bit for 32-bits DMA space.
+	 * First level was allocated but some lower level failed as
+	 * we did not allocate as much as we wanted,
+	 * release partially allocated table.
 	 */
-	rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-					pe->pe_number << 1, 1, __pa(addr),
-					tce_table_size, 0x1000);
-	if (rc) {
-		pe_err(pe, "Failed to configure 32-bit TCE table,"
-		       " err %ld\n", rc);
-		goto fail;
+	if (offset < tce_table_size) {
+		pnv_pci_ioda2_table_do_free_pages(addr,
+				1ULL << (level_shift - 3), levels - 1);
+		return -ENOMEM;
 	}
 
 	/* Setup linux iommu table */
-	tbl = pe->tce32_table;
-	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0,
-			IOMMU_PAGE_SHIFT_4K);
+	pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
+			page_shift);
+	tbl->it_level_size = 1ULL << (level_shift - 3);
+	tbl->it_indirect_levels = levels - 1;
+	tbl->it_allocated_size = offset;
 
-	/* OPAL variant of PHB3 invalidated TCEs */
-	swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
-	if (swinvp) {
-		/* We need a couple more fields -- an address and a data
-		 * to or.  Since the bus is only printed out on table free
-		 * errors, and on the first pass the data will be a relative
-		 * bus number, print that out instead.
-		 */
-		pe->tce_inval_reg_phys = be64_to_cpup(swinvp);
-		tbl->it_index = (unsigned long)ioremap(pe->tce_inval_reg_phys,
-				8);
-		tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+	pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
+			window_size, tce_table_size, bus_offset);
+
+	return 0;
+}
+
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+		unsigned long size, unsigned level)
+{
+	const unsigned long addr_ul = (unsigned long) addr &
+			~(TCE_PCI_READ | TCE_PCI_WRITE);
+
+	if (level) {
+		long i;
+		u64 *tmp = (u64 *) addr_ul;
+
+		for (i = 0; i < size; ++i) {
+			unsigned long hpa = be64_to_cpu(tmp[i]);
+
+			if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE)))
+				continue;
+
+			pnv_pci_ioda2_table_do_free_pages(__va(hpa), size,
+					level - 1);
+		}
 	}
-	iommu_init_table(tbl, phb->hose->node);
 
-	if (pe->flags & PNV_IODA_PE_DEV) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
-		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
-	} else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
-		pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
-	} else if (pe->flags & PNV_IODA_PE_VF) {
-		iommu_register_group(tbl, phb->hose->global_number,
-				     pe->pe_number);
-	}
-
-	/* Also create a bypass window */
-	if (!pnv_iommu_bypass_disabled)
-		pnv_pci_ioda2_setup_bypass_pe(phb, pe);
+	free_pages(addr_ul, get_order(size << 3));
+}
 
-	return;
-fail:
-	if (pe->tce32_seg >= 0)
-		pe->tce32_seg = -1;
-	if (tce_mem)
-		__free_pages(tce_mem, get_order(tce_table_size));
+static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
+{
+	const unsigned long size = tbl->it_indirect_levels ?
+			tbl->it_level_size : tbl->it_size;
+
+	if (!tbl->it_size)
+		return;
+
+	pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size,
+			tbl->it_indirect_levels);
+}
+
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+				       struct pnv_ioda_pe *pe)
+{
+	int64_t rc;
+
+	/* We shouldn't already have a 32-bit DMA associated */
+	if (WARN_ON(pe->tce32_seg >= 0))
+		return;
+
+	/* TVE #1 is selected by PCI address bit 59 */
+	pe->tce_bypass_base = 1ull << 59;
+
+	iommu_register_group(&pe->table_group, phb->hose->global_number,
+			pe->pe_number);
+
+	/* The PE will reserve all possible 32-bits space */
+	pe->tce32_seg = 0;
+	pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
+		phb->ioda.m32_pci_base);
+
+	/* Setup linux iommu table */
+	pe->table_group.tce32_start = 0;
+	pe->table_group.tce32_size = phb->ioda.m32_pci_base;
+	pe->table_group.max_dynamic_windows_supported =
+			IOMMU_TABLE_GROUP_MAX_TABLES;
+	pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
+	pe->table_group.pgsizes = SZ_4K | SZ_64K | SZ_16M;
+#ifdef CONFIG_IOMMU_API
+	pe->table_group.ops = &pnv_pci_ioda2_ops;
+#endif
+
+	rc = pnv_pci_ioda2_setup_default_config(pe);
+	if (rc) {
+		if (pe->tce32_seg >= 0)
+			pe->tce32_seg = -1;
+		return;
+	}
+
+	if (pe->flags & PNV_IODA_PE_DEV)
+		iommu_add_device(&pe->pdev->dev);
+	else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+		pnv_ioda_setup_bus_dma(pe, pe->pbus);
 }
 
 static void pnv_ioda_setup_dma(struct pnv_phb *phb)
@@ -2024,6 +2418,8 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb)
 	pr_info("PCI: %d PE# for a total weight of %d\n",
 		phb->ioda.dma_pe_count, phb->ioda.dma_weight);
 
+	pnv_pci_ioda_setup_opal_tce_kill(phb);
+
 	/* Walk our PE list and configure their DMA segments, hand them
 	 * out one base segment plus any residual segments based on
 	 * weight
@@ -2642,12 +3038,27 @@ static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
 	return phb->ioda.pe_rmap[(bus->number << 8) | devfn];
 }
 
-static void pnv_pci_ioda_shutdown(struct pnv_phb *phb)
+static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
 {
+	struct pnv_phb *phb = hose->private_data;
+
 	opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
 		       OPAL_ASSERT_RESET);
 }
 
+static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
+       .dma_dev_setup = pnv_pci_dma_dev_setup,
+#ifdef CONFIG_PCI_MSI
+       .setup_msi_irqs = pnv_setup_msi_irqs,
+       .teardown_msi_irqs = pnv_teardown_msi_irqs,
+#endif
+       .enable_device_hook = pnv_pci_enable_device_hook,
+       .window_alignment = pnv_pci_window_alignment,
+       .reset_secondary_bus = pnv_pci_reset_secondary_bus,
+       .dma_set_mask = pnv_pci_ioda_dma_set_mask,
+       .shutdown = pnv_pci_ioda_shutdown,
+};
+
 static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 					 u64 hub_id, int ioda_type)
 {
@@ -2791,12 +3202,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 
 	/* Setup TCEs */
 	phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
-	phb->dma_set_mask = pnv_pci_ioda_dma_set_mask;
 	phb->dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask;
 
-	/* Setup shutdown function for kexec */
-	phb->shutdown = pnv_pci_ioda_shutdown;
-
 	/* Setup MSI support */
 	pnv_pci_init_ioda_msis(phb);
 
@@ -2808,10 +3215,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 	 * the child P2P bridges) can form individual PE.
 	 */
 	ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
-	pnv_pci_controller_ops.enable_device_hook = pnv_pci_enable_device_hook;
-	pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
-	pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus;
-	hose->controller_ops = pnv_pci_controller_ops;
+	hose->controller_ops = pnv_pci_ioda_controller_ops;
 
 #ifdef CONFIG_PCI_IOV
 	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index 4729ca793813..f2bdfea3b68d 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -83,18 +83,42 @@ static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb)
 static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { }
 #endif /* CONFIG_PCI_MSI */
 
+static struct iommu_table_ops pnv_p5ioc2_iommu_ops = {
+	.set = pnv_tce_build,
+#ifdef CONFIG_IOMMU_API
+	.exchange = pnv_tce_xchg,
+#endif
+	.clear = pnv_tce_free,
+	.get = pnv_tce_get,
+};
+
 static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
 					 struct pci_dev *pdev)
 {
-	if (phb->p5ioc2.iommu_table.it_map == NULL) {
-		iommu_init_table(&phb->p5ioc2.iommu_table, phb->hose->node);
-		iommu_register_group(&phb->p5ioc2.iommu_table,
+	struct iommu_table *tbl = phb->p5ioc2.table_group.tables[0];
+
+	if (!tbl->it_map) {
+		tbl->it_ops = &pnv_p5ioc2_iommu_ops;
+		iommu_init_table(tbl, phb->hose->node);
+		iommu_register_group(&phb->p5ioc2.table_group,
 				pci_domain_nr(phb->hose->bus), phb->opal_id);
+		INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+		pnv_pci_link_table_and_group(phb->hose->node, 0,
+				tbl, &phb->p5ioc2.table_group);
 	}
 
-	set_iommu_table_base_and_group(&pdev->dev, &phb->p5ioc2.iommu_table);
+	set_iommu_table_base(&pdev->dev, tbl);
+	iommu_add_device(&pdev->dev);
 }
 
+static const struct pci_controller_ops pnv_pci_p5ioc2_controller_ops = {
+	.dma_dev_setup = pnv_pci_dma_dev_setup,
+#ifdef CONFIG_PCI_MSI
+       .setup_msi_irqs = pnv_setup_msi_irqs,
+       .teardown_msi_irqs = pnv_teardown_msi_irqs,
+#endif
+};
+
 static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
 					   void *tce_mem, u64 tce_size)
 {
@@ -103,6 +127,8 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
 	u64 phb_id;
 	int64_t rc;
 	static int primary = 1;
+	struct iommu_table_group *table_group;
+	struct iommu_table *tbl;
 
 	pr_info(" Initializing p5ioc2 PHB %s\n", np->full_name);
 
@@ -133,7 +159,7 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
 	phb->hose->first_busno = 0;
 	phb->hose->last_busno = 0xff;
 	phb->hose->private_data = phb;
-	phb->hose->controller_ops = pnv_pci_controller_ops;
+	phb->hose->controller_ops = pnv_pci_p5ioc2_controller_ops;
 	phb->hub_id = hub_id;
 	phb->opal_id = phb_id;
 	phb->type = PNV_PHB_P5IOC2;
@@ -172,6 +198,15 @@ static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
 	pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table,
 				  tce_mem, tce_size, 0,
 				  IOMMU_PAGE_SHIFT_4K);
+	/*
+	 * We do not allocate iommu_table as we do not support
+	 * hotplug or SRIOV on P5IOC2 and therefore iommu_free_table()
+	 * should not be called for phb->p5ioc2.table_group.tables[0] ever.
+	 */
+	tbl = phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table;
+	table_group = &phb->p5ioc2.table_group;
+	table_group->tce32_start = tbl->it_offset << tbl->it_page_shift;
+	table_group->tce32_size = tbl->it_size << tbl->it_page_shift;
 }
 
 void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index bca2aeb6e4b6..765d8ed558d0 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -45,7 +45,7 @@
 //#define cfg_dbg(fmt...)	printk(fmt)
 
 #ifdef CONFIG_PCI_MSI
-static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 {
 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 	struct pnv_phb *phb = hose->private_data;
@@ -94,7 +94,7 @@ static int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 	return 0;
 }
 
-static void pnv_teardown_msi_irqs(struct pci_dev *pdev)
+void pnv_teardown_msi_irqs(struct pci_dev *pdev)
 {
 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 	struct pnv_phb *phb = hose->private_data;
@@ -572,80 +572,152 @@ struct pci_ops pnv_pci_ops = {
 	.write = pnv_pci_write_config,
 };
 
-static int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
-			 unsigned long uaddr, enum dma_data_direction direction,
-			 struct dma_attrs *attrs, bool rm)
+static __be64 *pnv_tce(struct iommu_table *tbl, long idx)
 {
-	u64 proto_tce;
-	__be64 *tcep, *tces;
-	u64 rpn;
-
-	proto_tce = TCE_PCI_READ; // Read allowed
+	__be64 *tmp = ((__be64 *)tbl->it_base);
+	int  level = tbl->it_indirect_levels;
+	const long shift = ilog2(tbl->it_level_size);
+	unsigned long mask = (tbl->it_level_size - 1) << (level * shift);
+
+	while (level) {
+		int n = (idx & mask) >> (level * shift);
+		unsigned long tce = be64_to_cpu(tmp[n]);
+
+		tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
+		idx &= ~mask;
+		mask >>= shift;
+		--level;
+	}
 
-	if (direction != DMA_TO_DEVICE)
-		proto_tce |= TCE_PCI_WRITE;
+	return tmp + idx;
+}
 
-	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
-	rpn = __pa(uaddr) >> tbl->it_page_shift;
+int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
+		unsigned long uaddr, enum dma_data_direction direction,
+		struct dma_attrs *attrs)
+{
+	u64 proto_tce = iommu_direction_to_tce_perm(direction);
+	u64 rpn = __pa(uaddr) >> tbl->it_page_shift;
+	long i;
 
-	while (npages--)
-		*(tcep++) = cpu_to_be64(proto_tce |
-				(rpn++ << tbl->it_page_shift));
+	for (i = 0; i < npages; i++) {
+		unsigned long newtce = proto_tce |
+			((rpn + i) << tbl->it_page_shift);
+		unsigned long idx = index - tbl->it_offset + i;
 
-	/* Some implementations won't cache invalid TCEs and thus may not
-	 * need that flush. We'll probably turn it_type into a bit mask
-	 * of flags if that becomes the case
-	 */
-	if (tbl->it_type & TCE_PCI_SWINV_CREATE)
-		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm);
+		*(pnv_tce(tbl, idx)) = cpu_to_be64(newtce);
+	}
 
 	return 0;
 }
 
-static int pnv_tce_build_vm(struct iommu_table *tbl, long index, long npages,
-			    unsigned long uaddr,
-			    enum dma_data_direction direction,
-			    struct dma_attrs *attrs)
+#ifdef CONFIG_IOMMU_API
+int pnv_tce_xchg(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction)
 {
-	return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs,
-			false);
+	u64 proto_tce = iommu_direction_to_tce_perm(*direction);
+	unsigned long newtce = *hpa | proto_tce, oldtce;
+	unsigned long idx = index - tbl->it_offset;
+
+	BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl));
+
+	oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce));
+	*hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+	*direction = iommu_tce_direction(oldtce);
+
+	return 0;
 }
+#endif
 
-static void pnv_tce_free(struct iommu_table *tbl, long index, long npages,
-		bool rm)
+void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
 {
-	__be64 *tcep, *tces;
+	long i;
 
-	tces = tcep = ((__be64 *)tbl->it_base) + index - tbl->it_offset;
+	for (i = 0; i < npages; i++) {
+		unsigned long idx = index - tbl->it_offset + i;
 
-	while (npages--)
-		*(tcep++) = cpu_to_be64(0);
+		*(pnv_tce(tbl, idx)) = cpu_to_be64(0);
+	}
+}
 
-	if (tbl->it_type & TCE_PCI_SWINV_FREE)
-		pnv_pci_ioda_tce_invalidate(tbl, tces, tcep - 1, rm);
+unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
+{
+	return *(pnv_tce(tbl, index - tbl->it_offset));
 }
 
-static void pnv_tce_free_vm(struct iommu_table *tbl, long index, long npages)
+struct iommu_table *pnv_pci_table_alloc(int nid)
 {
-	pnv_tce_free(tbl, index, npages, false);
+	struct iommu_table *tbl;
+
+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid);
+	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+
+	return tbl;
 }
 
-static unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
+long pnv_pci_link_table_and_group(int node, int num,
+		struct iommu_table *tbl,
+		struct iommu_table_group *table_group)
 {
-	return ((u64 *)tbl->it_base)[index - tbl->it_offset];
+	struct iommu_table_group_link *tgl = NULL;
+
+	if (WARN_ON(!tbl || !table_group))
+		return -EINVAL;
+
+	tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL,
+			node);
+	if (!tgl)
+		return -ENOMEM;
+
+	tgl->table_group = table_group;
+	list_add_rcu(&tgl->next, &tbl->it_group_list);
+
+	table_group->tables[num] = tbl;
+
+	return 0;
 }
 
-static int pnv_tce_build_rm(struct iommu_table *tbl, long index, long npages,
-			    unsigned long uaddr,
-			    enum dma_data_direction direction,
-			    struct dma_attrs *attrs)
+static void pnv_iommu_table_group_link_free(struct rcu_head *head)
 {
-	return pnv_tce_build(tbl, index, npages, uaddr, direction, attrs, true);
+	struct iommu_table_group_link *tgl = container_of(head,
+			struct iommu_table_group_link, rcu);
+
+	kfree(tgl);
 }
 
-static void pnv_tce_free_rm(struct iommu_table *tbl, long index, long npages)
+void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
+		struct iommu_table_group *table_group)
 {
-	pnv_tce_free(tbl, index, npages, true);
+	long i;
+	bool found;
+	struct iommu_table_group_link *tgl;
+
+	if (!tbl || !table_group)
+		return;
+
+	/* Remove link to a group from table's list of attached groups */
+	found = false;
+	list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
+		if (tgl->table_group == table_group) {
+			list_del_rcu(&tgl->next);
+			call_rcu(&tgl->rcu, pnv_iommu_table_group_link_free);
+			found = true;
+			break;
+		}
+	}
+	if (WARN_ON(!found))
+		return;
+
+	/* Clean a pointer to iommu_table in iommu_table_group::tables[] */
+	found = false;
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		if (table_group->tables[i] == tbl) {
+			table_group->tables[i] = NULL;
+			found = true;
+			break;
+		}
+	}
+	WARN_ON(!found);
 }
 
 void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
@@ -662,7 +734,7 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 	tbl->it_type = TCE_PCI;
 }
 
-static void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
+void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
 {
 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
 	struct pnv_phb *phb = hose->private_data;
@@ -689,16 +761,6 @@ static void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
 		phb->dma_dev_setup(phb, pdev);
 }
 
-int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
-{
-	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
-	struct pnv_phb *phb = hose->private_data;
-
-	if (phb && phb->dma_set_mask)
-		return phb->dma_set_mask(phb, pdev, dma_mask);
-	return __dma_set_mask(&pdev->dev, dma_mask);
-}
-
 u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev)
 {
 	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
@@ -714,12 +776,9 @@ void pnv_pci_shutdown(void)
 {
 	struct pci_controller *hose;
 
-	list_for_each_entry(hose, &hose_list, list_node) {
-		struct pnv_phb *phb = hose->private_data;
-
-		if (phb && phb->shutdown)
-			phb->shutdown(phb);
-	}
+	list_for_each_entry(hose, &hose_list, list_node)
+		if (hose->controller_ops.shutdown)
+			hose->controller_ops.shutdown(hose);
 }
 
 /* Fixup wrong class code in p7ioc and p8 root complex */
@@ -762,22 +821,7 @@ void __init pnv_pci_init(void)
 	pci_devs_phb_init();
 
 	/* Configure IOMMU DMA hooks */
-	ppc_md.tce_build = pnv_tce_build_vm;
-	ppc_md.tce_free = pnv_tce_free_vm;
-	ppc_md.tce_build_rm = pnv_tce_build_rm;
-	ppc_md.tce_free_rm = pnv_tce_free_rm;
-	ppc_md.tce_get = pnv_tce_get;
 	set_pci_dma_ops(&dma_iommu_ops);
-
-	/* Configure MSIs */
-#ifdef CONFIG_PCI_MSI
-	ppc_md.setup_msi_irqs = pnv_setup_msi_irqs;
-	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
-#endif
 }
 
 machine_subsys_initcall_sync(powernv, tce_iommu_bus_notifier_init);
-
-struct pci_controller_ops pnv_pci_controller_ops = {
-	.dma_dev_setup = pnv_pci_dma_dev_setup,
-};
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 070ee888fc95..8ef2d28aded0 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -57,8 +57,7 @@ struct pnv_ioda_pe {
 	/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
 	int			tce32_seg;
 	int			tce32_segcount;
-	struct iommu_table	*tce32_table;
-	phys_addr_t		tce_inval_reg_phys;
+	struct iommu_table_group table_group;
 
 	/* 64-bit TCE bypass region */
 	bool			tce_bypass_enabled;
@@ -106,13 +105,10 @@ struct pnv_phb {
 			 unsigned int hwirq, unsigned int virq,
 			 unsigned int is_64, struct msi_msg *msg);
 	void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
-	int (*dma_set_mask)(struct pnv_phb *phb, struct pci_dev *pdev,
-			    u64 dma_mask);
 	u64 (*dma_get_required_mask)(struct pnv_phb *phb,
 				     struct pci_dev *pdev);
 	void (*fixup_phb)(struct pci_controller *hose);
 	u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn);
-	void (*shutdown)(struct pnv_phb *phb);
 	int (*init_m64)(struct pnv_phb *phb);
 	void (*reserve_m64_pe)(struct pnv_phb *phb);
 	int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all);
@@ -123,6 +119,7 @@ struct pnv_phb {
 	union {
 		struct {
 			struct iommu_table iommu_table;
+			struct iommu_table_group table_group;
 		} p5ioc2;
 
 		struct {
@@ -186,6 +183,12 @@ struct pnv_phb {
 			 * boot for resource allocation purposes
 			 */
 			struct list_head	pe_dma_list;
+
+			/* TCE cache invalidate registers (physical and
+			 * remapped)
+			 */
+			phys_addr_t		tce_inval_reg_phys;
+			__be64 __iomem		*tce_inval_reg;
 		} ioda;
 	};
 
@@ -200,6 +203,13 @@ struct pnv_phb {
 };
 
 extern struct pci_ops pnv_pci_ops;
+extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
+		unsigned long uaddr, enum dma_data_direction direction,
+		struct dma_attrs *attrs);
+extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
+extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
+		unsigned long *hpa, enum dma_data_direction *direction);
+extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
 
 void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
 				unsigned char *log_buff);
@@ -207,6 +217,13 @@ int pnv_pci_cfg_read(struct pci_dn *pdn,
 		     int where, int size, u32 *val);
 int pnv_pci_cfg_write(struct pci_dn *pdn,
 		      int where, int size, u32 val);
+extern struct iommu_table *pnv_pci_table_alloc(int nid);
+
+extern long pnv_pci_link_table_and_group(int node, int num,
+		struct iommu_table *tbl,
+		struct iommu_table_group *table_group);
+extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
+		struct iommu_table_group *table_group);
 extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 				      void *tce_mem, u64 tce_size,
 				      u64 dma_offset, unsigned page_shift);
@@ -218,4 +235,8 @@ extern void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
 extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
 extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
 
+extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev);
+extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
+extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
+
 #endif /* __POWERNV_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index 826d2c9bea56..9269e30e4ca0 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -12,29 +12,24 @@ struct pci_dev;
 #ifdef CONFIG_PCI
 extern void pnv_pci_init(void);
 extern void pnv_pci_shutdown(void);
-extern int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask);
 extern u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev);
 #else
 static inline void pnv_pci_init(void) { }
 static inline void pnv_pci_shutdown(void) { }
 
-static inline int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
-{
-	return -ENODEV;
-}
-
 static inline u64 pnv_pci_dma_get_required_mask(struct pci_dev *pdev)
 {
 	return 0;
 }
 #endif
 
-extern struct pci_controller_ops pnv_pci_controller_ops;
-
 extern u32 pnv_get_supported_cpuidle_states(void);
 
 extern void pnv_lpc_init(void);
 
+extern void opal_handle_events(uint64_t events);
+extern void opal_event_shutdown(void);
+
 bool cpu_core_split_required(void);
 
 #endif /* _POWERNV_H */
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 16fdcb23f4c3..53737e019ae3 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -35,12 +35,8 @@
 #include <asm/opal.h>
 #include <asm/kexec.h>
 #include <asm/smp.h>
-#include <asm/cputhreads.h>
-#include <asm/cpuidle.h>
-#include <asm/code-patching.h>
 
 #include "powernv.h"
-#include "subcore.h"
 
 static void __init pnv_setup_arch(void)
 {
@@ -111,7 +107,7 @@ static void pnv_prepare_going_down(void)
 	 * Disable all notifiers from OPAL, we can't
 	 * service interrupts anymore anyway
 	 */
-	opal_notifier_disable();
+	opal_event_shutdown();
 
 	/* Soft disable interrupts */
 	local_irq_disable();
@@ -169,13 +165,6 @@ static void pnv_progress(char *s, unsigned short hex)
 {
 }
 
-static int pnv_dma_set_mask(struct device *dev, u64 dma_mask)
-{
-	if (dev_is_pci(dev))
-		return pnv_pci_dma_set_mask(to_pci_dev(dev), dma_mask);
-	return __dma_set_mask(dev, dma_mask);
-}
-
 static u64 pnv_dma_get_required_mask(struct device *dev)
 {
 	if (dev_is_pci(dev))
@@ -277,173 +266,6 @@ static void __init pnv_setup_machdep_opal(void)
 	ppc_md.handle_hmi_exception = opal_handle_hmi_exception;
 }
 
-static u32 supported_cpuidle_states;
-
-int pnv_save_sprs_for_winkle(void)
-{
-	int cpu;
-	int rc;
-
-	/*
-	 * hid0, hid1, hid4, hid5, hmeer and lpcr values are symmetric accross
-	 * all cpus at boot. Get these reg values of current cpu and use the
-	 * same accross all cpus.
-	 */
-	uint64_t lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1;
-	uint64_t hid0_val = mfspr(SPRN_HID0);
-	uint64_t hid1_val = mfspr(SPRN_HID1);
-	uint64_t hid4_val = mfspr(SPRN_HID4);
-	uint64_t hid5_val = mfspr(SPRN_HID5);
-	uint64_t hmeer_val = mfspr(SPRN_HMEER);
-
-	for_each_possible_cpu(cpu) {
-		uint64_t pir = get_hard_smp_processor_id(cpu);
-		uint64_t hsprg0_val = (uint64_t)&paca[cpu];
-
-		/*
-		 * HSPRG0 is used to store the cpu's pointer to paca. Hence last
-		 * 3 bits are guaranteed to be 0. Program slw to restore HSPRG0
-		 * with 63rd bit set, so that when a thread wakes up at 0x100 we
-		 * can use this bit to distinguish between fastsleep and
-		 * deep winkle.
-		 */
-		hsprg0_val |= 1;
-
-		rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val);
-		if (rc != 0)
-			return rc;
-
-		rc = opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val);
-		if (rc != 0)
-			return rc;
-
-		/* HIDs are per core registers */
-		if (cpu_thread_in_core(cpu) == 0) {
-
-			rc = opal_slw_set_reg(pir, SPRN_HMEER, hmeer_val);
-			if (rc != 0)
-				return rc;
-
-			rc = opal_slw_set_reg(pir, SPRN_HID0, hid0_val);
-			if (rc != 0)
-				return rc;
-
-			rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val);
-			if (rc != 0)
-				return rc;
-
-			rc = opal_slw_set_reg(pir, SPRN_HID4, hid4_val);
-			if (rc != 0)
-				return rc;
-
-			rc = opal_slw_set_reg(pir, SPRN_HID5, hid5_val);
-			if (rc != 0)
-				return rc;
-		}
-	}
-
-	return 0;
-}
-
-static void pnv_alloc_idle_core_states(void)
-{
-	int i, j;
-	int nr_cores = cpu_nr_cores();
-	u32 *core_idle_state;
-
-	/*
-	 * core_idle_state - First 8 bits track the idle state of each thread
-	 * of the core. The 8th bit is the lock bit. Initially all thread bits
-	 * are set. They are cleared when the thread enters deep idle state
-	 * like sleep and winkle. Initially the lock bit is cleared.
-	 * The lock bit has 2 purposes
-	 * a. While the first thread is restoring core state, it prevents
-	 * other threads in the core from switching to process context.
-	 * b. While the last thread in the core is saving the core state, it
-	 * prevents a different thread from waking up.
-	 */
-	for (i = 0; i < nr_cores; i++) {
-		int first_cpu = i * threads_per_core;
-		int node = cpu_to_node(first_cpu);
-
-		core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
-		*core_idle_state = PNV_CORE_IDLE_THREAD_BITS;
-
-		for (j = 0; j < threads_per_core; j++) {
-			int cpu = first_cpu + j;
-
-			paca[cpu].core_idle_state_ptr = core_idle_state;
-			paca[cpu].thread_idle_state = PNV_THREAD_RUNNING;
-			paca[cpu].thread_mask = 1 << j;
-		}
-	}
-
-	update_subcore_sibling_mask();
-
-	if (supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED)
-		pnv_save_sprs_for_winkle();
-}
-
-u32 pnv_get_supported_cpuidle_states(void)
-{
-	return supported_cpuidle_states;
-}
-EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
-
-static int __init pnv_init_idle_states(void)
-{
-	struct device_node *power_mgt;
-	int dt_idle_states;
-	u32 *flags;
-	int i;
-
-	supported_cpuidle_states = 0;
-
-	if (cpuidle_disable != IDLE_NO_OVERRIDE)
-		goto out;
-
-	if (!firmware_has_feature(FW_FEATURE_OPALv3))
-		goto out;
-
-	power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
-	if (!power_mgt) {
-		pr_warn("opal: PowerMgmt Node not found\n");
-		goto out;
-	}
-	dt_idle_states = of_property_count_u32_elems(power_mgt,
-			"ibm,cpu-idle-state-flags");
-	if (dt_idle_states < 0) {
-		pr_warn("cpuidle-powernv: no idle states found in the DT\n");
-		goto out;
-	}
-
-	flags = kzalloc(sizeof(*flags) * dt_idle_states, GFP_KERNEL);
-	if (of_property_read_u32_array(power_mgt,
-			"ibm,cpu-idle-state-flags", flags, dt_idle_states)) {
-		pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-flags in DT\n");
-		goto out_free;
-	}
-
-	for (i = 0; i < dt_idle_states; i++)
-		supported_cpuidle_states |= flags[i];
-
-	if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
-		patch_instruction(
-			(unsigned int *)pnv_fastsleep_workaround_at_entry,
-			PPC_INST_NOP);
-		patch_instruction(
-			(unsigned int *)pnv_fastsleep_workaround_at_exit,
-			PPC_INST_NOP);
-	}
-	pnv_alloc_idle_core_states();
-out_free:
-	kfree(flags);
-out:
-	return 0;
-}
-
-subsys_initcall(pnv_init_idle_states);
-
 static int __init pnv_probe(void)
 {
 	unsigned long root = of_get_flat_dt_root();
@@ -492,7 +314,6 @@ define_machine(powernv) {
 	.machine_shutdown	= pnv_shutdown,
 	.power_save             = power7_idle,
 	.calibrate_decr		= generic_calibrate_decr,
-	.dma_set_mask		= pnv_dma_set_mask,
 	.dma_get_required_mask	= pnv_dma_get_required_mask,
 #ifdef CONFIG_KEXEC
 	.kexec_cpu_down		= pnv_kexec_cpu_down,
diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index 019d34aaf054..47d9cebe7159 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -421,11 +421,10 @@ static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
 		return -ENODEV;
 
 	dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent);
+	of_node_put(parent);
 	if (!dn)
 		return -EINVAL;
 
-	of_node_put(parent);
-
 	rc = dlpar_attach_node(dn);
 	if (rc) {
 		dlpar_release_drc(drc_index);
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 2039397cc75d..1ba55d0bb449 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -519,7 +519,7 @@ static int pseries_eeh_reset(struct eeh_pe *pe, int option)
 /**
  * pseries_eeh_wait_state - Wait for PE state
  * @pe: EEH PE
- * @max_wait: maximal period in microsecond
+ * @max_wait: maximal period in millisecond
  *
  * Wait for the state of associated PE. It might take some time
  * to retrieve the PE's state.
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 61d5a17f45c0..10510dea16b3 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -36,6 +36,8 @@
 #include <linux/crash_dump.h>
 #include <linux/memory.h>
 #include <linux/of.h>
+#include <linux/iommu.h>
+#include <linux/rculist.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
@@ -51,6 +53,73 @@
 
 #include "pseries.h"
 
+static struct iommu_table_group *iommu_pseries_alloc_group(int node)
+{
+	struct iommu_table_group *table_group = NULL;
+	struct iommu_table *tbl = NULL;
+	struct iommu_table_group_link *tgl = NULL;
+
+	table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
+			   node);
+	if (!table_group)
+		goto fail_exit;
+
+	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
+	if (!tbl)
+		goto fail_exit;
+
+	tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL,
+			node);
+	if (!tgl)
+		goto fail_exit;
+
+	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+	tgl->table_group = table_group;
+	list_add_rcu(&tgl->next, &tbl->it_group_list);
+
+	table_group->tables[0] = tbl;
+
+	return table_group;
+
+fail_exit:
+	kfree(tgl);
+	kfree(table_group);
+	kfree(tbl);
+
+	return NULL;
+}
+
+static void iommu_pseries_free_group(struct iommu_table_group *table_group,
+		const char *node_name)
+{
+	struct iommu_table *tbl;
+#ifdef CONFIG_IOMMU_API
+	struct iommu_table_group_link *tgl;
+#endif
+
+	if (!table_group)
+		return;
+
+	tbl = table_group->tables[0];
+#ifdef CONFIG_IOMMU_API
+	tgl = list_first_entry_or_null(&tbl->it_group_list,
+			struct iommu_table_group_link, next);
+
+	WARN_ON_ONCE(!tgl);
+	if (tgl) {
+		list_del_rcu(&tgl->next);
+		kfree(tgl);
+	}
+	if (table_group->group) {
+		iommu_group_put(table_group->group);
+		BUG_ON(table_group->group);
+	}
+#endif
+	iommu_free_table(tbl, node_name);
+
+	kfree(table_group);
+}
+
 static void tce_invalidate_pSeries_sw(struct iommu_table *tbl,
 				      __be64 *startp, __be64 *endp)
 {
@@ -193,7 +262,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
 	int ret = 0;
 	unsigned long flags;
 
-	if (npages == 1) {
+	if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) {
 		return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
 		                           direction, attrs);
 	}
@@ -285,6 +354,9 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n
 {
 	u64 rc;
 
+	if (!firmware_has_feature(FW_FEATURE_MULTITCE))
+		return tce_free_pSeriesLP(tbl, tcenum, npages);
+
 	rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages);
 
 	if (rc && printk_ratelimit()) {
@@ -460,7 +532,6 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn,
 	return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
 }
 
-
 #ifdef CONFIG_PCI
 static void iommu_table_setparms(struct pci_controller *phb,
 				 struct device_node *dn,
@@ -546,6 +617,12 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb,
 	tbl->it_size = size >> tbl->it_page_shift;
 }
 
+struct iommu_table_ops iommu_table_pseries_ops = {
+	.set = tce_build_pSeries,
+	.clear = tce_free_pSeries,
+	.get = tce_get_pseries
+};
+
 static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 {
 	struct device_node *dn;
@@ -610,12 +687,13 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 	pci->phb->dma_window_size = 0x8000000ul;
 	pci->phb->dma_window_base_cur = 0x8000000ul;
 
-	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-			   pci->phb->node);
+	pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
+	tbl = pci->table_group->tables[0];
 
 	iommu_table_setparms(pci->phb, dn, tbl);
-	pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
-	iommu_register_group(tbl, pci_domain_nr(bus), 0);
+	tbl->it_ops = &iommu_table_pseries_ops;
+	iommu_init_table(tbl, pci->phb->node);
+	iommu_register_group(pci->table_group, pci_domain_nr(bus), 0);
 
 	/* Divide the rest (1.75GB) among the children */
 	pci->phb->dma_window_size = 0x80000000ul;
@@ -625,6 +703,11 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
 	pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
 }
 
+struct iommu_table_ops iommu_table_lpar_multi_ops = {
+	.set = tce_buildmulti_pSeriesLP,
+	.clear = tce_freemulti_pSeriesLP,
+	.get = tce_get_pSeriesLP
+};
 
 static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 {
@@ -653,15 +736,17 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
 	ppci = PCI_DN(pdn);
 
 	pr_debug("  parent is %s, iommu_table: 0x%p\n",
-		 pdn->full_name, ppci->iommu_table);
+		 pdn->full_name, ppci->table_group);
 
-	if (!ppci->iommu_table) {
-		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-				   ppci->phb->node);
+	if (!ppci->table_group) {
+		ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
+		tbl = ppci->table_group->tables[0];
 		iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
-		ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node);
-		iommu_register_group(tbl, pci_domain_nr(bus), 0);
-		pr_debug("  created table: %p\n", ppci->iommu_table);
+		tbl->it_ops = &iommu_table_lpar_multi_ops;
+		iommu_init_table(tbl, ppci->phb->node);
+		iommu_register_group(ppci->table_group,
+				pci_domain_nr(bus), 0);
+		pr_debug("  created table: %p\n", ppci->table_group);
 	}
 }
 
@@ -683,13 +768,15 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		struct pci_controller *phb = PCI_DN(dn)->phb;
 
 		pr_debug(" --> first child, no bridge. Allocating iommu table.\n");
-		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-				   phb->node);
+		PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node);
+		tbl = PCI_DN(dn)->table_group->tables[0];
 		iommu_table_setparms(phb, dn, tbl);
-		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
-		iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
-		set_iommu_table_base_and_group(&dev->dev,
-					       PCI_DN(dn)->iommu_table);
+		tbl->it_ops = &iommu_table_pseries_ops;
+		iommu_init_table(tbl, phb->node);
+		iommu_register_group(PCI_DN(dn)->table_group,
+				pci_domain_nr(phb->bus), 0);
+		set_iommu_table_base(&dev->dev, tbl);
+		iommu_add_device(&dev->dev);
 		return;
 	}
 
@@ -697,13 +784,14 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 	 * an already allocated iommu table is found and use that.
 	 */
 
-	while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL)
+	while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
 		dn = dn->parent;
 
-	if (dn && PCI_DN(dn))
-		set_iommu_table_base_and_group(&dev->dev,
-					       PCI_DN(dn)->iommu_table);
-	else
+	if (dn && PCI_DN(dn)) {
+		set_iommu_table_base(&dev->dev,
+				PCI_DN(dn)->table_group->tables[0]);
+		iommu_add_device(&dev->dev);
+	} else
 		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
 		       pci_name(dev));
 }
@@ -1088,7 +1176,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 	dn = pci_device_to_OF_node(dev);
 	pr_debug("  node is %s\n", dn->full_name);
 
-	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
+	for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
 	     pdn = pdn->parent) {
 		dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
 		if (dma_window)
@@ -1104,18 +1192,21 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 	pr_debug("  parent is %s\n", pdn->full_name);
 
 	pci = PCI_DN(pdn);
-	if (!pci->iommu_table) {
-		tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL,
-				   pci->phb->node);
+	if (!pci->table_group) {
+		pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
+		tbl = pci->table_group->tables[0];
 		iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
-		pci->iommu_table = iommu_init_table(tbl, pci->phb->node);
-		iommu_register_group(tbl, pci_domain_nr(pci->phb->bus), 0);
-		pr_debug("  created table: %p\n", pci->iommu_table);
+		tbl->it_ops = &iommu_table_lpar_multi_ops;
+		iommu_init_table(tbl, pci->phb->node);
+		iommu_register_group(pci->table_group,
+				pci_domain_nr(pci->phb->bus), 0);
+		pr_debug("  created table: %p\n", pci->table_group);
 	} else {
-		pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
+		pr_debug("  found DMA window, table: %p\n", pci->table_group);
 	}
 
-	set_iommu_table_base_and_group(&dev->dev, pci->iommu_table);
+	set_iommu_table_base(&dev->dev, pci->table_group->tables[0]);
+	iommu_add_device(&dev->dev);
 }
 
 static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
@@ -1145,7 +1236,7 @@ static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
 		 * search upwards in the tree until we either hit a dma-window
 		 * property, OR find a parent with a table already allocated.
 		 */
-		for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table;
+		for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->table_group;
 				pdn = pdn->parent) {
 			dma_window = of_get_property(pdn, "ibm,dma-window", NULL);
 			if (dma_window)
@@ -1189,7 +1280,7 @@ static u64 dma_get_required_mask_pSeriesLP(struct device *dev)
 		dn = pci_device_to_OF_node(pdev);
 
 		/* search upwards for ibm,dma-window */
-		for (; dn && PCI_DN(dn) && !PCI_DN(dn)->iommu_table;
+		for (; dn && PCI_DN(dn) && !PCI_DN(dn)->table_group;
 				dn = dn->parent)
 			if (of_get_property(dn, "ibm,dma-window", NULL))
 				break;
@@ -1269,8 +1360,9 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti
 		 * the device node.
 		 */
 		remove_ddw(np, false);
-		if (pci && pci->iommu_table)
-			iommu_free_table(pci->iommu_table, np->full_name);
+		if (pci && pci->table_group)
+			iommu_pseries_free_group(pci->table_group,
+					np->full_name);
 
 		spin_lock(&direct_window_list_lock);
 		list_for_each_entry(window, &direct_window_list, list) {
@@ -1300,22 +1392,11 @@ void iommu_init_early_pSeries(void)
 		return;
 
 	if (firmware_has_feature(FW_FEATURE_LPAR)) {
-		if (firmware_has_feature(FW_FEATURE_MULTITCE)) {
-			ppc_md.tce_build = tce_buildmulti_pSeriesLP;
-			ppc_md.tce_free	 = tce_freemulti_pSeriesLP;
-		} else {
-			ppc_md.tce_build = tce_build_pSeriesLP;
-			ppc_md.tce_free	 = tce_free_pSeriesLP;
-		}
-		ppc_md.tce_get   = tce_get_pSeriesLP;
 		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
 		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
 		ppc_md.dma_set_mask = dma_set_mask_pSeriesLP;
 		ppc_md.dma_get_required_mask = dma_get_required_mask_pSeriesLP;
 	} else {
-		ppc_md.tce_build = tce_build_pSeries;
-		ppc_md.tce_free  = tce_free_pSeries;
-		ppc_md.tce_get   = tce_get_pseries;
 		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeries;
 		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeries;
 	}
@@ -1333,8 +1414,6 @@ static int __init disable_multitce(char *str)
 	    firmware_has_feature(FW_FEATURE_LPAR) &&
 	    firmware_has_feature(FW_FEATURE_MULTITCE)) {
 		printk(KERN_INFO "Disabling MULTITCE firmware feature\n");
-		ppc_md.tce_build = tce_build_pSeriesLP;
-		ppc_md.tce_free	 = tce_free_pSeriesLP;
 		powerpc_firmware_features &= ~FW_FEATURE_MULTITCE;
 	}
 	return 1;
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index c8d24f9a6948..c22bb647cce6 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -18,6 +18,8 @@
 #include <asm/ppc-pci.h>
 #include <asm/machdep.h>
 
+#include "pseries.h"
+
 static int query_token, change_token;
 
 #define RTAS_QUERY_FN		0
@@ -505,6 +507,8 @@ static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev)
 
 static int rtas_msi_init(void)
 {
+	struct pci_controller *phb;
+
 	query_token  = rtas_token("ibm,query-interrupt-source-number");
 	change_token = rtas_token("ibm,change-msi");
 
@@ -516,9 +520,15 @@ static int rtas_msi_init(void)
 
 	pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n");
 
-	WARN_ON(ppc_md.setup_msi_irqs);
-	ppc_md.setup_msi_irqs = rtas_setup_msi_irqs;
-	ppc_md.teardown_msi_irqs = rtas_teardown_msi_irqs;
+	WARN_ON(pseries_pci_controller_ops.setup_msi_irqs);
+	pseries_pci_controller_ops.setup_msi_irqs = rtas_setup_msi_irqs;
+	pseries_pci_controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs;
+
+	list_for_each_entry(phb, &hose_list, list_node) {
+		WARN_ON(phb->controller_ops.setup_msi_irqs);
+		phb->controller_ops.setup_msi_irqs = rtas_setup_msi_irqs;
+		phb->controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs;
+	}
 
 	WARN_ON(ppc_md.pci_irq_fixup);
 	ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup;
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index f7cb2a1b01fa..5b492a6438ff 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -2,7 +2,7 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 
 ccflags-$(CONFIG_PPC64)		:= $(NO_MINIMAL_TOC)
 
-mpic-msi-obj-$(CONFIG_PCI_MSI)	+= mpic_msi.o mpic_u3msi.o mpic_pasemi_msi.o
+mpic-msi-obj-$(CONFIG_PCI_MSI)	+= mpic_msi.o mpic_u3msi.o
 obj-$(CONFIG_MPIC)		+= mpic.o $(mpic-msi-obj-y)
 obj-$(CONFIG_MPIC_TIMER)        += mpic_timer.o
 obj-$(CONFIG_FSL_MPIC_TIMER_WAKEUP)	+= fsl_mpic_timer_wakeup.o
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index d00a5663e312..90bcdfeedf48 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -286,6 +286,12 @@ static int __init dart_init(struct device_node *dart_node)
 	return 0;
 }
 
+static struct iommu_table_ops iommu_dart_ops = {
+	.set = dart_build,
+	.clear = dart_free,
+	.flush = dart_flush,
+};
+
 static void iommu_table_dart_setup(void)
 {
 	iommu_table_dart.it_busno = 0;
@@ -298,6 +304,7 @@ static void iommu_table_dart_setup(void)
 	iommu_table_dart.it_base = (unsigned long)dart_vbase;
 	iommu_table_dart.it_index = 0;
 	iommu_table_dart.it_blocksize = 1;
+	iommu_table_dart.it_ops = &iommu_dart_ops;
 	iommu_init_table(&iommu_table_dart, -1);
 
 	/* Reserve the last page of the DART to avoid possible prefetch
@@ -386,11 +393,6 @@ void __init iommu_init_early_dart(struct pci_controller_ops *controller_ops)
 	if (dart_init(dn) != 0)
 		goto bail;
 
-	/* Setup low level TCE operations for the core IOMMU code */
-	ppc_md.tce_build = dart_build;
-	ppc_md.tce_free  = dart_free;
-	ppc_md.tce_flush = dart_flush;
-
 	/* Setup bypass if supported */
 	if (dart_is_u4)
 		ppc_md.dma_set_mask = dart_dma_set_mask;
diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index f086c6f22dc9..5236e5427c38 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -405,6 +405,7 @@ static int fsl_of_msi_probe(struct platform_device *dev)
 	const struct fsl_msi_feature *features;
 	int len;
 	u32 offset;
+	struct pci_controller *phb;
 
 	match = of_match_device(fsl_of_msi_ids, &dev->dev);
 	if (!match)
@@ -541,14 +542,20 @@ static int fsl_of_msi_probe(struct platform_device *dev)
 
 	list_add_tail(&msi->list, &msi_head);
 
-	/* The multiple setting ppc_md.setup_msi_irqs will not harm things */
-	if (!ppc_md.setup_msi_irqs) {
-		ppc_md.setup_msi_irqs = fsl_setup_msi_irqs;
-		ppc_md.teardown_msi_irqs = fsl_teardown_msi_irqs;
-	} else if (ppc_md.setup_msi_irqs != fsl_setup_msi_irqs) {
-		dev_err(&dev->dev, "Different MSI driver already installed!\n");
-		err = -ENODEV;
-		goto error_out;
+	/*
+	 * Apply the MSI ops to all the controllers.
+	 * It doesn't hurt to reassign the same ops,
+	 * but bail out if we find another MSI driver.
+	 */
+	list_for_each_entry(phb, &hose_list, list_node) {
+		if (!phb->controller_ops.setup_msi_irqs) {
+			phb->controller_ops.setup_msi_irqs = fsl_setup_msi_irqs;
+			phb->controller_ops.teardown_msi_irqs = fsl_teardown_msi_irqs;
+		} else if (phb->controller_ops.setup_msi_irqs != fsl_setup_msi_irqs) {
+			dev_err(&dev->dev, "Different MSI driver already installed!\n");
+			err = -ENODEV;
+			goto error_out;
+		}
 	}
 	return 0;
 error_out:
diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c
index 45598da0b321..31c33475c7b7 100644
--- a/arch/powerpc/sysdev/i8259.c
+++ b/arch/powerpc/sysdev/i8259.c
@@ -204,7 +204,7 @@ static int i8259_host_xlate(struct irq_domain *h, struct device_node *ct,
 	return 0;
 }
 
-static struct irq_domain_ops i8259_host_ops = {
+static const struct irq_domain_ops i8259_host_ops = {
 	.match = i8259_host_match,
 	.map = i8259_host_map,
 	.xlate = i8259_host_xlate,
diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c
index b28733727ed3..d78f1364b639 100644
--- a/arch/powerpc/sysdev/ipic.c
+++ b/arch/powerpc/sysdev/ipic.c
@@ -691,7 +691,7 @@ static int ipic_host_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops ipic_host_ops = {
+static const struct irq_domain_ops ipic_host_ops = {
 	.match	= ipic_host_match,
 	.map	= ipic_host_map,
 	.xlate	= irq_domain_xlate_onetwocell,
diff --git a/arch/powerpc/sysdev/mpc8xx_pic.c b/arch/powerpc/sysdev/mpc8xx_pic.c
index c4828c0be5bd..d93a78be4346 100644
--- a/arch/powerpc/sysdev/mpc8xx_pic.c
+++ b/arch/powerpc/sysdev/mpc8xx_pic.c
@@ -120,7 +120,7 @@ static int mpc8xx_pic_host_xlate(struct irq_domain *h, struct device_node *ct,
 }
 
 
-static struct irq_domain_ops mpc8xx_pic_host_ops = {
+static const struct irq_domain_ops mpc8xx_pic_host_ops = {
 	.map = mpc8xx_pic_host_map,
 	.xlate = mpc8xx_pic_host_xlate,
 };
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index b2b8447a227a..c8e73332eaad 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -1195,7 +1195,7 @@ static void mpic_cascade(unsigned int irq, struct irq_desc *desc)
 	chip->irq_eoi(&desc->irq_data);
 }
 
-static struct irq_domain_ops mpic_host_ops = {
+static const struct irq_domain_ops mpic_host_ops = {
 	.match = mpic_host_match,
 	.map = mpic_host_map,
 	.xlate = mpic_host_xlate,
diff --git a/arch/powerpc/sysdev/mpic.h b/arch/powerpc/sysdev/mpic.h
index 24bf07a63924..32971a41853b 100644
--- a/arch/powerpc/sysdev/mpic.h
+++ b/arch/powerpc/sysdev/mpic.h
@@ -15,7 +15,6 @@
 extern void mpic_msi_reserve_hwirq(struct mpic *mpic, irq_hw_number_t hwirq);
 extern int mpic_msi_init_allocator(struct mpic *mpic);
 extern int mpic_u3msi_init(struct mpic *mpic);
-extern int mpic_pasemi_msi_init(struct mpic *mpic);
 #else
 static inline void mpic_msi_reserve_hwirq(struct mpic *mpic,
 					  irq_hw_number_t hwirq)
@@ -27,11 +26,12 @@ static inline int mpic_u3msi_init(struct mpic *mpic)
 {
 	return -1;
 }
+#endif
 
-static inline int mpic_pasemi_msi_init(struct mpic *mpic)
-{
-	return -1;
-}
+#if defined(CONFIG_PCI_MSI) && defined(CONFIG_PPC_PASEMI)
+int mpic_pasemi_msi_init(struct mpic *mpic);
+#else
+static inline int mpic_pasemi_msi_init(struct mpic *mpic) { return -1; }
 #endif
 
 extern int mpic_set_irq_type(struct irq_data *d, unsigned int flow_type);
diff --git a/arch/powerpc/sysdev/mpic_u3msi.c b/arch/powerpc/sysdev/mpic_u3msi.c
index b2cef1809389..fc46ef3b816e 100644
--- a/arch/powerpc/sysdev/mpic_u3msi.c
+++ b/arch/powerpc/sysdev/mpic_u3msi.c
@@ -181,6 +181,7 @@ static int u3msi_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 int mpic_u3msi_init(struct mpic *mpic)
 {
 	int rc;
+	struct pci_controller *phb;
 
 	rc = mpic_msi_init_allocator(mpic);
 	if (rc) {
@@ -193,9 +194,11 @@ int mpic_u3msi_init(struct mpic *mpic)
 	BUG_ON(msi_mpic);
 	msi_mpic = mpic;
 
-	WARN_ON(ppc_md.setup_msi_irqs);
-	ppc_md.setup_msi_irqs = u3msi_setup_msi_irqs;
-	ppc_md.teardown_msi_irqs = u3msi_teardown_msi_irqs;
+	list_for_each_entry(phb, &hose_list, list_node) {
+		WARN_ON(phb->controller_ops.setup_msi_irqs);
+		phb->controller_ops.setup_msi_irqs = u3msi_setup_msi_irqs;
+		phb->controller_ops.teardown_msi_irqs = u3msi_teardown_msi_irqs;
+	}
 
 	return 0;
 }
diff --git a/arch/powerpc/sysdev/mv64x60_pic.c b/arch/powerpc/sysdev/mv64x60_pic.c
index 8848e99a83f2..0f842dd16bcd 100644
--- a/arch/powerpc/sysdev/mv64x60_pic.c
+++ b/arch/powerpc/sysdev/mv64x60_pic.c
@@ -223,7 +223,7 @@ static int mv64x60_host_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops mv64x60_host_ops = {
+static const struct irq_domain_ops mv64x60_host_ops = {
 	.map   = mv64x60_host_map,
 };
 
diff --git a/arch/powerpc/sysdev/ppc4xx_hsta_msi.c b/arch/powerpc/sysdev/ppc4xx_hsta_msi.c
index f366d2d4c079..2bc33674ebfc 100644
--- a/arch/powerpc/sysdev/ppc4xx_hsta_msi.c
+++ b/arch/powerpc/sysdev/ppc4xx_hsta_msi.c
@@ -128,6 +128,7 @@ static int hsta_msi_probe(struct platform_device *pdev)
 	struct device *dev = &pdev->dev;
 	struct resource *mem;
 	int irq, ret, irq_count;
+	struct pci_controller *phb;
 
 	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (IS_ERR(mem)) {
@@ -171,8 +172,10 @@ static int hsta_msi_probe(struct platform_device *pdev)
 		}
 	}
 
-	ppc_md.setup_msi_irqs = hsta_setup_msi_irqs;
-	ppc_md.teardown_msi_irqs = hsta_teardown_msi_irqs;
+	list_for_each_entry(phb, &hose_list, list_node) {
+		phb->controller_ops.setup_msi_irqs = hsta_setup_msi_irqs;
+		phb->controller_ops.teardown_msi_irqs = hsta_teardown_msi_irqs;
+	}
 	return 0;
 
 out2:
diff --git a/arch/powerpc/sysdev/ppc4xx_msi.c b/arch/powerpc/sysdev/ppc4xx_msi.c
index 6e2e6aa378bb..6eb21f2ea585 100644
--- a/arch/powerpc/sysdev/ppc4xx_msi.c
+++ b/arch/powerpc/sysdev/ppc4xx_msi.c
@@ -218,6 +218,7 @@ static int ppc4xx_msi_probe(struct platform_device *dev)
 	struct ppc4xx_msi *msi;
 	struct resource res;
 	int err = 0;
+	struct pci_controller *phb;
 
 	dev_dbg(&dev->dev, "PCIE-MSI: Setting up MSI support...\n");
 
@@ -250,8 +251,10 @@ static int ppc4xx_msi_probe(struct platform_device *dev)
 	}
 	ppc4xx_msi = *msi;
 
-	ppc_md.setup_msi_irqs = ppc4xx_setup_msi_irqs;
-	ppc_md.teardown_msi_irqs = ppc4xx_teardown_msi_irqs;
+	list_for_each_entry(phb, &hose_list, list_node) {
+		phb->controller_ops.setup_msi_irqs = ppc4xx_setup_msi_irqs;
+		phb->controller_ops.teardown_msi_irqs = ppc4xx_teardown_msi_irqs;
+	}
 	return err;
 
 error_out:
diff --git a/arch/powerpc/sysdev/qe_lib/qe_ic.c b/arch/powerpc/sysdev/qe_lib/qe_ic.c
index 543765e1ef14..6512cd8caa51 100644
--- a/arch/powerpc/sysdev/qe_lib/qe_ic.c
+++ b/arch/powerpc/sysdev/qe_lib/qe_ic.c
@@ -271,7 +271,7 @@ static int qe_ic_host_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops qe_ic_host_ops = {
+static const struct irq_domain_ops qe_ic_host_ops = {
 	.match = qe_ic_host_match,
 	.map = qe_ic_host_map,
 	.xlate = irq_domain_xlate_onetwocell,
diff --git a/arch/powerpc/sysdev/tsi108_pci.c b/arch/powerpc/sysdev/tsi108_pci.c
index 188012c58f7f..57b54476e747 100644
--- a/arch/powerpc/sysdev/tsi108_pci.c
+++ b/arch/powerpc/sysdev/tsi108_pci.c
@@ -397,7 +397,7 @@ static int pci_irq_host_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops pci_irq_domain_ops = {
+static const struct irq_domain_ops pci_irq_domain_ops = {
 	.map = pci_irq_host_map,
 	.xlate = pci_irq_host_xlate,
 };
diff --git a/arch/powerpc/sysdev/uic.c b/arch/powerpc/sysdev/uic.c
index 7c37157d4c24..d77345338671 100644
--- a/arch/powerpc/sysdev/uic.c
+++ b/arch/powerpc/sysdev/uic.c
@@ -189,7 +189,7 @@ static int uic_host_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops uic_host_ops = {
+static const struct irq_domain_ops uic_host_ops = {
 	.map	= uic_host_map,
 	.xlate	= irq_domain_xlate_twocell,
 };
@@ -198,7 +198,7 @@ void uic_irq_cascade(unsigned int virq, struct irq_desc *desc)
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
 	struct irq_data *idata = irq_desc_get_irq_data(desc);
-	struct uic *uic = irq_get_handler_data(virq);
+	struct uic *uic = irq_desc_get_handler_data(desc);
 	u32 msr;
 	int src;
 	int subvirq;
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index 2fc4cf1b7557..eae32654bdf2 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -147,12 +147,16 @@ static void icp_native_cause_ipi(int cpu, unsigned long data)
 {
 	kvmppc_set_host_ipi(cpu, 1);
 #ifdef CONFIG_PPC_DOORBELL
-	if (cpu_has_feature(CPU_FTR_DBELL) &&
-	    (cpumask_test_cpu(cpu, cpu_sibling_mask(smp_processor_id()))))
-		doorbell_cause_ipi(cpu, data);
-	else
+	if (cpu_has_feature(CPU_FTR_DBELL)) {
+		if (cpumask_test_cpu(cpu, cpu_sibling_mask(get_cpu()))) {
+			doorbell_cause_ipi(cpu, data);
+			put_cpu();
+			return;
+		}
+		put_cpu();
+	}
 #endif
-		icp_native_set_qirr(cpu, IPI_PRIORITY);
+	icp_native_set_qirr(cpu, IPI_PRIORITY);
 }
 
 /*
diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c
index 878a54036a25..08c248eb491b 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -227,7 +227,7 @@ void xics_migrate_irqs_away(void)
 
 		/* Locate interrupt server */
 		server = -1;
-		ics = irq_get_chip_data(virq);
+		ics = irq_desc_get_chip_data(desc);
 		if (ics)
 			server = ics->get_server(ics, irq);
 		if (server < 0) {
@@ -360,7 +360,7 @@ static int xics_host_xlate(struct irq_domain *h, struct device_node *ct,
 	return 0;
 }
 
-static struct irq_domain_ops xics_host_ops = {
+static const struct irq_domain_ops xics_host_ops = {
 	.match = xics_host_match,
 	.map = xics_host_map,
 	.xlate = xics_host_xlate,
diff --git a/arch/powerpc/sysdev/xilinx_intc.c b/arch/powerpc/sysdev/xilinx_intc.c
index 56f0524e47a6..43b8b275bc5c 100644
--- a/arch/powerpc/sysdev/xilinx_intc.c
+++ b/arch/powerpc/sysdev/xilinx_intc.c
@@ -179,7 +179,7 @@ static int xilinx_intc_map(struct irq_domain *h, unsigned int virq,
 	return 0;
 }
 
-static struct irq_domain_ops xilinx_intc_ops = {
+static const struct irq_domain_ops xilinx_intc_ops = {
 	.map = xilinx_intc_map,
 	.xlate = xilinx_intc_xlate,
 };
diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c
index 7940dc90e80b..b258110da952 100644
--- a/arch/s390/crypto/ghash_s390.c
+++ b/arch/s390/crypto/ghash_s390.c
@@ -16,11 +16,12 @@
 #define GHASH_DIGEST_SIZE	16
 
 struct ghash_ctx {
-	u8 icv[16];
-	u8 key[16];
+	u8 key[GHASH_BLOCK_SIZE];
 };
 
 struct ghash_desc_ctx {
+	u8 icv[GHASH_BLOCK_SIZE];
+	u8 key[GHASH_BLOCK_SIZE];
 	u8 buffer[GHASH_BLOCK_SIZE];
 	u32 bytes;
 };
@@ -28,8 +29,10 @@ struct ghash_desc_ctx {
 static int ghash_init(struct shash_desc *desc)
 {
 	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+	struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
 
 	memset(dctx, 0, sizeof(*dctx));
+	memcpy(dctx->key, ctx->key, GHASH_BLOCK_SIZE);
 
 	return 0;
 }
@@ -45,7 +48,6 @@ static int ghash_setkey(struct crypto_shash *tfm,
 	}
 
 	memcpy(ctx->key, key, GHASH_BLOCK_SIZE);
-	memset(ctx->icv, 0, GHASH_BLOCK_SIZE);
 
 	return 0;
 }
@@ -54,7 +56,6 @@ static int ghash_update(struct shash_desc *desc,
 			 const u8 *src, unsigned int srclen)
 {
 	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-	struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
 	unsigned int n;
 	u8 *buf = dctx->buffer;
 	int ret;
@@ -70,7 +71,7 @@ static int ghash_update(struct shash_desc *desc,
 		src += n;
 
 		if (!dctx->bytes) {
-			ret = crypt_s390_kimd(KIMD_GHASH, ctx, buf,
+			ret = crypt_s390_kimd(KIMD_GHASH, dctx, buf,
 					      GHASH_BLOCK_SIZE);
 			if (ret != GHASH_BLOCK_SIZE)
 				return -EIO;
@@ -79,7 +80,7 @@ static int ghash_update(struct shash_desc *desc,
 
 	n = srclen & ~(GHASH_BLOCK_SIZE - 1);
 	if (n) {
-		ret = crypt_s390_kimd(KIMD_GHASH, ctx, src, n);
+		ret = crypt_s390_kimd(KIMD_GHASH, dctx, src, n);
 		if (ret != n)
 			return -EIO;
 		src += n;
@@ -94,7 +95,7 @@ static int ghash_update(struct shash_desc *desc,
 	return 0;
 }
 
-static int ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx)
+static int ghash_flush(struct ghash_desc_ctx *dctx)
 {
 	u8 *buf = dctx->buffer;
 	int ret;
@@ -104,24 +105,24 @@ static int ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx)
 
 		memset(pos, 0, dctx->bytes);
 
-		ret = crypt_s390_kimd(KIMD_GHASH, ctx, buf, GHASH_BLOCK_SIZE);
+		ret = crypt_s390_kimd(KIMD_GHASH, dctx, buf, GHASH_BLOCK_SIZE);
 		if (ret != GHASH_BLOCK_SIZE)
 			return -EIO;
+
+		dctx->bytes = 0;
 	}
 
-	dctx->bytes = 0;
 	return 0;
 }
 
 static int ghash_final(struct shash_desc *desc, u8 *dst)
 {
 	struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
-	struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
 	int ret;
 
-	ret = ghash_flush(ctx, dctx);
+	ret = ghash_flush(dctx);
 	if (!ret)
-		memcpy(dst, ctx->icv, GHASH_BLOCK_SIZE);
+		memcpy(dst, dctx->icv, GHASH_BLOCK_SIZE);
 	return ret;
 }
 
diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c
index 1f374b39a4ec..9d5192c94963 100644
--- a/arch/s390/crypto/prng.c
+++ b/arch/s390/crypto/prng.c
@@ -125,7 +125,7 @@ static int generate_entropy(u8 *ebuf, size_t nbytes)
 		/* fill page with urandom bytes */
 		get_random_bytes(pg, PAGE_SIZE);
 		/* exor page with stckf values */
-		for (n = 0; n < sizeof(PAGE_SIZE/sizeof(u64)); n++) {
+		for (n = 0; n < PAGE_SIZE / sizeof(u64); n++) {
 			u64 *p = ((u64 *)pg) + n;
 			*p ^= get_tod_clock_fast();
 		}
diff --git a/arch/s390/hypfs/hypfs_sprp.c b/arch/s390/hypfs/hypfs_sprp.c
index f043c3c7e73c..dd42a26d049d 100644
--- a/arch/s390/hypfs/hypfs_sprp.c
+++ b/arch/s390/hypfs/hypfs_sprp.c
@@ -128,14 +128,14 @@ static struct hypfs_dbfs_file hypfs_sprp_file = {
 
 int hypfs_sprp_init(void)
 {
-	if (!sclp_has_sprp())
+	if (!sclp.has_sprp)
 		return 0;
 	return hypfs_dbfs_create_file(&hypfs_sprp_file);
 }
 
 void hypfs_sprp_exit(void)
 {
-	if (!sclp_has_sprp())
+	if (!sclp.has_sprp)
 		return;
 	hypfs_dbfs_remove_file(&hypfs_sprp_file);
 }
diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
index 8d724718ec21..e6f8615a11eb 100644
--- a/arch/s390/include/asm/barrier.h
+++ b/arch/s390/include/asm/barrier.h
@@ -36,7 +36,7 @@
 #define smp_mb__before_atomic()		smp_mb()
 #define smp_mb__after_atomic()		smp_mb()
 
-#define set_mb(var, value)		do { var = value; mb(); } while (0)
+#define smp_store_mb(var, value)		do { WRITE_ONCE(var, value); mb(); } while (0)
 
 #define smp_store_release(p, v)						\
 do {									\
diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h
index 4eadec466b8c..411464f4c97a 100644
--- a/arch/s390/include/asm/cmpxchg.h
+++ b/arch/s390/include/asm/cmpxchg.h
@@ -32,8 +32,6 @@
 	__old;								\
 })
 
-#define __HAVE_ARCH_CMPXCHG
-
 #define __cmpxchg_double_op(p1, p2, o1, o2, n1, n2, insn)		\
 ({									\
 	register __typeof__(*(p1)) __old1 asm("2") = (o1);		\
diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h
index 30fd5c84680e..cb5fdf3a78fc 100644
--- a/arch/s390/include/asm/io.h
+++ b/arch/s390/include/asm/io.h
@@ -29,6 +29,7 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
 
 #define ioremap_nocache(addr, size)	ioremap(addr, size)
 #define ioremap_wc			ioremap_nocache
+#define ioremap_wt			ioremap_nocache
 
 static inline void __iomem *ioremap(unsigned long offset, unsigned long size)
 {
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index d01fc588b5c3..3024acbe1f9d 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -80,6 +80,7 @@ struct sca_block {
 #define CPUSTAT_MCDS       0x00000100
 #define CPUSTAT_SM         0x00000080
 #define CPUSTAT_IBS        0x00000040
+#define CPUSTAT_GED2       0x00000010
 #define CPUSTAT_G          0x00000008
 #define CPUSTAT_GED        0x00000004
 #define CPUSTAT_J          0x00000002
@@ -95,7 +96,8 @@ struct kvm_s390_sie_block {
 #define PROG_IN_SIE (1<<0)
 	__u32	prog0c;			/* 0x000c */
 	__u8	reserved10[16];		/* 0x0010 */
-#define PROG_BLOCK_SIE 0x00000001
+#define PROG_BLOCK_SIE	(1<<0)
+#define PROG_REQUEST	(1<<1)
 	atomic_t prog20;		/* 0x0020 */
 	__u8	reserved24[4];		/* 0x0024 */
 	__u64	cputm;			/* 0x0028 */
@@ -634,7 +636,7 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_free_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
-static inline void kvm_arch_memslots_updated(struct kvm *kvm) {}
+static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *slot) {}
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index fc642399b489..0bb2da79adf3 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -494,7 +494,7 @@ static inline int pmd_large(pmd_t pmd)
 	return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0;
 }
 
-static inline int pmd_pfn(pmd_t pmd)
+static inline unsigned long pmd_pfn(pmd_t pmd)
 {
 	unsigned long origin_mask;
 
@@ -1565,9 +1565,9 @@ static inline int has_transparent_hugepage(void)
 /*
  * 64 bit swap entry format:
  * A page-table entry has some bits we have to treat in a special way.
- * Bits 52 and bit 55 have to be zero, otherwise an specification
+ * Bits 52 and bit 55 have to be zero, otherwise a specification
  * exception will occur instead of a page translation exception. The
- * specifiation exception has the bad habit not to store necessary
+ * specification exception has the bad habit not to store necessary
  * information in the lowcore.
  * Bits 54 and 63 are used to indicate the page type.
  * A swap pte is indicated by bit pattern (pte & 0x201) == 0x200
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index f1096bab5199..c891f41b2753 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -46,33 +46,39 @@ struct sclp_cpu_info {
 	struct sclp_cpu_entry cpu[MAX_CPU_ADDRESS + 1];
 };
 
+struct sclp_info {
+	unsigned char has_linemode : 1;
+	unsigned char has_vt220 : 1;
+	unsigned char has_siif : 1;
+	unsigned char has_sigpif : 1;
+	unsigned char has_cpu_type : 1;
+	unsigned char has_sprp : 1;
+	unsigned int ibc;
+	unsigned int mtid;
+	unsigned int mtid_cp;
+	unsigned int mtid_prev;
+	unsigned long long rzm;
+	unsigned long long rnmax;
+	unsigned long long hamax;
+	unsigned int max_cpu;
+	unsigned long hsa_size;
+	unsigned long long facilities;
+};
+extern struct sclp_info sclp;
+
 int sclp_get_cpu_info(struct sclp_cpu_info *info);
 int sclp_cpu_configure(u8 cpu);
 int sclp_cpu_deconfigure(u8 cpu);
-unsigned long long sclp_get_rnmax(void);
-unsigned long long sclp_get_rzm(void);
-unsigned int sclp_get_max_cpu(void);
-unsigned int sclp_get_mtid(u8 cpu_type);
-unsigned int sclp_get_mtid_max(void);
-unsigned int sclp_get_mtid_prev(void);
 int sclp_sdias_blk_count(void);
 int sclp_sdias_copy(void *dest, int blk_num, int nr_blks);
 int sclp_chp_configure(struct chp_id chpid);
 int sclp_chp_deconfigure(struct chp_id chpid);
 int sclp_chp_read_info(struct sclp_chp_info *info);
 void sclp_get_ipl_info(struct sclp_ipl_info *info);
-bool __init sclp_has_linemode(void);
-bool __init sclp_has_vt220(void);
-bool sclp_has_sprp(void);
 int sclp_pci_configure(u32 fid);
 int sclp_pci_deconfigure(u32 fid);
 int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode);
-unsigned long sclp_get_hsa_size(void);
 void sclp_early_detect(void);
-int sclp_has_siif(void);
-int sclp_has_sigpif(void);
-unsigned int sclp_get_ibc(void);
-
 long _sclp_print_early(const char *);
 
 #endif /* _ASM_S390_SCLP_H */
diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h
index 98eb2a579223..dcb6312a0b91 100644
--- a/arch/s390/include/asm/timex.h
+++ b/arch/s390/include/asm/timex.h
@@ -10,6 +10,7 @@
 #define _ASM_S390_TIMEX_H
 
 #include <asm/lowcore.h>
+#include <linux/time64.h>
 
 /* The value of the TOD clock for 1.1.1970. */
 #define TOD_UNIX_EPOCH 0x7d91048bca000000ULL
@@ -108,10 +109,10 @@ int get_sync_clock(unsigned long long *clock);
 void init_cpu_timer(void);
 unsigned long long monotonic_clock(void);
 
-void tod_to_timeval(__u64, struct timespec *);
+void tod_to_timeval(__u64 todval, struct timespec64 *xt);
 
 static inline
-void stck_to_timespec(unsigned long long stck, struct timespec *ts)
+void stck_to_timespec64(unsigned long long stck, struct timespec64 *ts)
 {
 	tod_to_timeval(stck - TOD_UNIX_EPOCH, ts);
 }
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index b1453a2ae1ca..4990f6c66288 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -22,7 +22,8 @@ DECLARE_PER_CPU(struct cpu_topology_s390, cpu_topology);
 
 #define topology_physical_package_id(cpu) (per_cpu(cpu_topology, cpu).socket_id)
 #define topology_thread_id(cpu)		  (per_cpu(cpu_topology, cpu).thread_id)
-#define topology_thread_cpumask(cpu)	  (&per_cpu(cpu_topology, cpu).thread_mask)
+#define topology_sibling_cpumask(cpu) \
+		(&per_cpu(cpu_topology, cpu).thread_mask)
 #define topology_core_id(cpu)		  (per_cpu(cpu_topology, cpu).core_id)
 #define topology_core_cpumask(cpu)	  (&per_cpu(cpu_topology, cpu).core_mask)
 #define topology_book_id(cpu)		  (per_cpu(cpu_topology, cpu).book_id)
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index d64a7a62164f..9dd4cc47ddc7 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -98,7 +98,8 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x)
  * @from: Source address, in user space.
  * @n:	  Number of bytes to copy.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -118,7 +119,8 @@ unsigned long __must_check __copy_from_user(void *to, const void __user *from,
  * @from: Source address, in kernel space.
  * @n:	  Number of bytes to copy.
  *
- * Context: User context only.	This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -264,7 +266,8 @@ int __get_user_bad(void) __attribute__((noreturn));
  * @from: Source address, in kernel space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.
  *
@@ -290,7 +293,8 @@ __compiletime_warning("copy_from_user() buffer size is not provably correct")
  * @from: Source address, in user space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.
  *
@@ -348,7 +352,8 @@ static inline unsigned long strnlen_user(const char __user *src, unsigned long n
  * strlen_user: - Get the size of a string in user space.
  * @str: The string to measure.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Get the size of a NUL-terminated string in user space.
  *
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index 9f73c8059022..d9f0dcfcae5e 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -122,7 +122,7 @@ static ssize_t copy_oldmem_page_zfcpdump(char *buf, size_t csize,
 {
 	int rc;
 
-	if (src < sclp_get_hsa_size()) {
+	if (src < sclp.hsa_size) {
 		rc = memcpy_hsa(buf, src, csize, userbuf);
 	} else {
 		if (userbuf)
@@ -215,7 +215,7 @@ static int remap_oldmem_pfn_range_zfcpdump(struct vm_area_struct *vma,
 					   unsigned long pfn,
 					   unsigned long size, pgprot_t prot)
 {
-	unsigned long hsa_end = sclp_get_hsa_size();
+	unsigned long hsa_end = sclp.hsa_size;
 	unsigned long size_hsa;
 
 	if (pfn < hsa_end >> PAGE_SHIFT) {
@@ -258,7 +258,7 @@ int copy_from_oldmem(void *dest, void *src, size_t count)
 				return rc;
 		}
 	} else {
-		unsigned long hsa_end = sclp_get_hsa_size();
+		unsigned long hsa_end = sclp.hsa_size;
 		if ((unsigned long) src < hsa_end) {
 			copied = min(count, hsa_end - (unsigned long) src);
 			rc = memcpy_hsa(dest, (unsigned long) src, copied, 0);
@@ -609,7 +609,7 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size)
 	if (elfcorehdr_addr != ELFCORE_ADDR_MAX)
 		return 0;
 	/* If we cannot get HSA size for zfcpdump return error */
-	if (ipl_info.type == IPL_TYPE_FCP_DUMP && !sclp_get_hsa_size())
+	if (ipl_info.type == IPL_TYPE_FCP_DUMP && !sclp.hsa_size)
 		return -ENODEV;
 
 	/* For kdump, exclude previous crashkernel memory */
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index c1f21aca76e7..6fca0e46464e 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -1457,23 +1457,24 @@ int
 debug_dflt_header_fn(debug_info_t * id, struct debug_view *view,
 			 int area, debug_entry_t * entry, char *out_buf)
 {
-	struct timespec time_spec;
+	struct timespec64 time_spec;
 	char *except_str;
 	unsigned long caller;
 	int rc = 0;
 	unsigned int level;
 
 	level = entry->id.fields.level;
-	stck_to_timespec(entry->id.stck, &time_spec);
+	stck_to_timespec64(entry->id.stck, &time_spec);
 
 	if (entry->id.fields.exception)
 		except_str = "*";
 	else
 		except_str = "-";
 	caller = ((unsigned long) entry->caller) & PSW_ADDR_INSN;
-	rc += sprintf(out_buf, "%02i %011lu:%06lu %1u %1s %02i %p  ",
-		      area, time_spec.tv_sec, time_spec.tv_nsec / 1000, level,
-		      except_str, entry->id.fields.cpuid, (void *) caller);
+	rc += sprintf(out_buf, "%02i %011lld:%06lu %1u %1s %02i %p  ",
+		      area, (long long)time_spec.tv_sec,
+		      time_spec.tv_nsec / 1000, level, except_str,
+		      entry->id.fields.cpuid, (void *)caller);
 	return rc;
 }
 EXPORT_SYMBOL(debug_dflt_header_fn);
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 99b44acbfcc7..3238893c9d4f 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -1005,7 +1005,7 @@ ENTRY(sie64a)
 .Lsie_gmap:
 	lg	%r14,__SF_EMPTY(%r15)		# get control block pointer
 	oi	__SIE_PROG0C+3(%r14),1		# we are going into SIE now
-	tm	__SIE_PROG20+3(%r14),1		# last exit...
+	tm	__SIE_PROG20+3(%r14),3		# last exit...
 	jnz	.Lsie_done
 	LPP	__SF_EMPTY(%r15)		# set guest id
 	sie	0(%r14)
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 7262fe438c99..af4f41d52cde 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -128,9 +128,9 @@ __setup("condev=", condev_setup);
 static void __init set_preferred_console(void)
 {
 	if (MACHINE_IS_KVM) {
-		if (sclp_has_vt220())
+		if (sclp.has_vt220)
 			add_preferred_console("ttyS", 1, NULL);
-		else if (sclp_has_linemode())
+		else if (sclp.has_linemode)
 			add_preferred_console("ttyS", 0, NULL);
 		else
 			add_preferred_console("hvc", 0, NULL);
@@ -510,8 +510,8 @@ static void reserve_memory_end(void)
 {
 #ifdef CONFIG_CRASH_DUMP
 	if (ipl_info.type == IPL_TYPE_FCP_DUMP &&
-	    !OLDMEM_BASE && sclp_get_hsa_size()) {
-		memory_end = sclp_get_hsa_size();
+	    !OLDMEM_BASE && sclp.hsa_size) {
+		memory_end = sclp.hsa_size;
 		memory_end &= PAGE_MASK;
 		memory_end_set = 1;
 	}
@@ -576,7 +576,7 @@ static void __init reserve_crashkernel(void)
 		crash_base = low;
 	} else {
 		/* Find suitable area in free memory */
-		low = max_t(unsigned long, crash_size, sclp_get_hsa_size());
+		low = max_t(unsigned long, crash_size, sclp.hsa_size);
 		high = crash_base ? crash_base + crash_size : ULONG_MAX;
 
 		if (crash_base && crash_base < low) {
@@ -640,19 +640,24 @@ static void __init check_initrd(void)
 }
 
 /*
- * Reserve all kernel text
+ * Reserve memory used for lowcore/command line/kernel image.
  */
 static void __init reserve_kernel(void)
 {
-	unsigned long start_pfn;
-	start_pfn = PFN_UP(__pa(&_end));
+	unsigned long start_pfn = PFN_UP(__pa(&_end));
 
+#ifdef CONFIG_DMA_API_DEBUG
 	/*
-	 * Reserve memory used for lowcore/command line/kernel image.
+	 * DMA_API_DEBUG code stumbles over addresses from the
+	 * range [_ehead, _stext]. Mark the memory as reserved
+	 * so it is not used for CONFIG_DMA_API_DEBUG=y.
 	 */
+	memblock_reserve(0, PFN_PHYS(start_pfn));
+#else
 	memblock_reserve(0, (unsigned long)_ehead);
 	memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn)
 			 - (unsigned long)_stext);
+#endif
 }
 
 static void __init reserve_elfcorehdr(void)
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index efd2c1968000..0d9d59d4710e 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -601,7 +601,7 @@ static void __init smp_store_cpu_states(struct sclp_cpu_info *info)
 		/* No previous system present, normal boot. */
 		return;
 	/* Set multi-threading state to the previous system. */
-	pcpu_set_smt(sclp_get_mtid_prev());
+	pcpu_set_smt(sclp.mtid_prev);
 	/* Collect CPU states. */
 	cpu = 0;
 	for (i = 0; i < info->configured; i++) {
@@ -740,7 +740,7 @@ static void __init smp_detect_cpus(void)
 #endif
 
 	/* Set multi-threading state for the current system */
-	mtid = sclp_get_mtid(boot_cpu_type);
+	mtid = boot_cpu_type ? sclp.mtid : sclp.mtid_cp;
 	mtid = (mtid < smp_max_threads) ? mtid : smp_max_threads - 1;
 	pcpu_set_smt(mtid);
 
@@ -880,12 +880,13 @@ void __noreturn cpu_die(void)
 
 void __init smp_fill_possible_mask(void)
 {
-	unsigned int possible, sclp, cpu;
+	unsigned int possible, sclp_max, cpu;
 
-	sclp = min(smp_max_threads, sclp_get_mtid_max() + 1);
-	sclp = sclp_get_max_cpu()*sclp ?: nr_cpu_ids;
+	sclp_max = max(sclp.mtid, sclp.mtid_cp) + 1;
+	sclp_max = min(smp_max_threads, sclp_max);
+	sclp_max = sclp.max_cpu * sclp_max ?: nr_cpu_ids;
 	possible = setup_possible_cpus ?: nr_cpu_ids;
-	possible = min(possible, sclp);
+	possible = min(possible, sclp_max);
 	for (cpu = 0; cpu < possible && cpu < nr_cpu_ids; cpu++)
 		set_cpu_possible(cpu, true);
 }
diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c
index d3236c9e226b..39e2f41b6cf0 100644
--- a/arch/s390/kernel/suspend.c
+++ b/arch/s390/kernel/suspend.c
@@ -9,10 +9,10 @@
 #include <linux/pfn.h>
 #include <linux/suspend.h>
 #include <linux/mm.h>
+#include <linux/pci.h>
 #include <asm/ctl_reg.h>
 #include <asm/ipl.h>
 #include <asm/cio.h>
-#include <asm/pci.h>
 #include <asm/sections.h>
 #include "entry.h"
 
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 170ddd2018b3..9e733d965e08 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -76,7 +76,7 @@ unsigned long long monotonic_clock(void)
 }
 EXPORT_SYMBOL(monotonic_clock);
 
-void tod_to_timeval(__u64 todval, struct timespec *xt)
+void tod_to_timeval(__u64 todval, struct timespec64 *xt)
 {
 	unsigned long long sec;
 
@@ -181,12 +181,12 @@ static void timing_alert_interrupt(struct ext_code ext_code,
 static void etr_reset(void);
 static void stp_reset(void);
 
-void read_persistent_clock(struct timespec *ts)
+void read_persistent_clock64(struct timespec64 *ts)
 {
 	tod_to_timeval(get_tod_clock() - TOD_UNIX_EPOCH, ts);
 }
 
-void read_boot_clock(struct timespec *ts)
+void read_boot_clock64(struct timespec64 *ts)
 {
 	tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, ts);
 }
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 9e3779e3e496..7365e8a46032 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -241,21 +241,6 @@ static int handle_prog(struct kvm_vcpu *vcpu)
 	return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
 }
 
-static int handle_instruction_and_prog(struct kvm_vcpu *vcpu)
-{
-	int rc, rc2;
-
-	vcpu->stat.exit_instr_and_program++;
-	rc = handle_instruction(vcpu);
-	rc2 = handle_prog(vcpu);
-
-	if (rc == -EOPNOTSUPP)
-		vcpu->arch.sie_block->icptcode = 0x04;
-	if (rc)
-		return rc;
-	return rc2;
-}
-
 /**
  * handle_external_interrupt - used for external interruption interceptions
  *
@@ -355,7 +340,6 @@ static const intercept_handler_t intercept_funcs[] = {
 	[0x00 >> 2] = handle_noop,
 	[0x04 >> 2] = handle_instruction,
 	[0x08 >> 2] = handle_prog,
-	[0x0C >> 2] = handle_instruction_and_prog,
 	[0x10 >> 2] = handle_noop,
 	[0x14 >> 2] = handle_external_interrupt,
 	[0x18 >> 2] = handle_noop,
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 9de47265ef73..c98d89708e99 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -134,6 +134,8 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
 
 	active_mask = pending_local_irqs(vcpu);
 	active_mask |= pending_floating_irqs(vcpu);
+	if (!active_mask)
+		return 0;
 
 	if (psw_extint_disabled(vcpu))
 		active_mask &= ~IRQ_PEND_EXT_MASK;
@@ -799,7 +801,7 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu)
 	struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
 	uint8_t sigp_ctrl = vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sigp_ctrl;
 
-	if (!sclp_has_sigpif())
+	if (!sclp.has_sigpif)
 		return test_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs);
 
 	return (sigp_ctrl & SIGP_CTRL_C) &&
@@ -941,12 +943,9 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 	if (cpu_timer_irq_pending(vcpu))
 		set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs);
 
-	do {
-		irqs = deliverable_irqs(vcpu);
+	while ((irqs = deliverable_irqs(vcpu)) && !rc) {
 		/* bits are in the order of interrupt priority */
 		irq_type = find_first_bit(&irqs, IRQ_PEND_COUNT);
-		if (irq_type == IRQ_PEND_COUNT)
-			break;
 		if (is_ioirq(irq_type)) {
 			rc = __deliver_io(vcpu, irq_type);
 		} else {
@@ -958,9 +957,7 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 			}
 			rc = func(vcpu);
 		}
-		if (rc)
-			break;
-	} while (!rc);
+	}
 
 	set_intercept_indicators(vcpu);
 
@@ -1058,10 +1055,10 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
 	    kvm_get_vcpu(vcpu->kvm, src_id) == NULL)
 		return -EINVAL;
 
-	if (sclp_has_sigpif())
+	if (sclp.has_sigpif)
 		return __inject_extcall_sigpif(vcpu, src_id);
 
-	if (!test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
+	if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs))
 		return -EBUSY;
 	*extcall = irq->u.extcall;
 	atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
@@ -1340,12 +1337,54 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
 	return 0;
 }
 
-static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
+/*
+ * Find a destination VCPU for a floating irq and kick it.
+ */
+static void __floating_irq_kick(struct kvm *kvm, u64 type)
 {
+	struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int;
 	struct kvm_s390_local_interrupt *li;
+	struct kvm_vcpu *dst_vcpu;
+	int sigcpu, online_vcpus, nr_tries = 0;
+
+	online_vcpus = atomic_read(&kvm->online_vcpus);
+	if (!online_vcpus)
+		return;
+
+	/* find idle VCPUs first, then round robin */
+	sigcpu = find_first_bit(fi->idle_mask, online_vcpus);
+	if (sigcpu == online_vcpus) {
+		do {
+			sigcpu = fi->next_rr_cpu;
+			fi->next_rr_cpu = (fi->next_rr_cpu + 1) % online_vcpus;
+			/* avoid endless loops if all vcpus are stopped */
+			if (nr_tries++ >= online_vcpus)
+				return;
+		} while (is_vcpu_stopped(kvm_get_vcpu(kvm, sigcpu)));
+	}
+	dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
+
+	/* make the VCPU drop out of the SIE, or wake it up if sleeping */
+	li = &dst_vcpu->arch.local_int;
+	spin_lock(&li->lock);
+	switch (type) {
+	case KVM_S390_MCHK:
+		atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
+		break;
+	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
+		atomic_set_mask(CPUSTAT_IO_INT, li->cpuflags);
+		break;
+	default:
+		atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
+		break;
+	}
+	spin_unlock(&li->lock);
+	kvm_s390_vcpu_wakeup(dst_vcpu);
+}
+
+static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
+{
 	struct kvm_s390_float_interrupt *fi;
-	struct kvm_vcpu *dst_vcpu = NULL;
-	int sigcpu;
 	u64 type = READ_ONCE(inti->type);
 	int rc;
 
@@ -1373,32 +1412,8 @@ static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
 	if (rc)
 		return rc;
 
-	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
-	if (sigcpu == KVM_MAX_VCPUS) {
-		do {
-			sigcpu = fi->next_rr_cpu++;
-			if (sigcpu == KVM_MAX_VCPUS)
-				sigcpu = fi->next_rr_cpu = 0;
-		} while (kvm_get_vcpu(kvm, sigcpu) == NULL);
-	}
-	dst_vcpu = kvm_get_vcpu(kvm, sigcpu);
-	li = &dst_vcpu->arch.local_int;
-	spin_lock(&li->lock);
-	switch (type) {
-	case KVM_S390_MCHK:
-		atomic_set_mask(CPUSTAT_STOP_INT, li->cpuflags);
-		break;
-	case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
-		atomic_set_mask(CPUSTAT_IO_INT, li->cpuflags);
-		break;
-	default:
-		atomic_set_mask(CPUSTAT_EXT_INT, li->cpuflags);
-		break;
-	}
-	spin_unlock(&li->lock);
-	kvm_s390_vcpu_wakeup(kvm_get_vcpu(kvm, sigcpu));
+	__floating_irq_kick(kvm, type);
 	return 0;
-
 }
 
 int kvm_s390_inject_vm(struct kvm *kvm,
@@ -1606,6 +1621,9 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm)
 	int i;
 
 	spin_lock(&fi->lock);
+	fi->pending_irqs = 0;
+	memset(&fi->srv_signal, 0, sizeof(fi->srv_signal));
+	memset(&fi->mchk, 0, sizeof(fi->mchk));
 	for (i = 0; i < FIRQ_LIST_COUNT; i++)
 		clear_irq_list(&fi->lists[i]);
 	for (i = 0; i < FIRQ_MAX_COUNT; i++)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 8cd8e7b288c5..2078f92d15ac 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -36,6 +36,10 @@
 #include "kvm-s390.h"
 #include "gaccess.h"
 
+#define KMSG_COMPONENT "kvm-s390"
+#undef pr_fmt
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 #include "trace-s390.h"
@@ -110,7 +114,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 /* upper facilities limit for kvm */
 unsigned long kvm_s390_fac_list_mask[] = {
 	0xffe6fffbfcfdfc40UL,
-	0x005c800000000000UL,
+	0x005e800000000000UL,
 };
 
 unsigned long kvm_s390_fac_list_mask_size(void)
@@ -236,6 +240,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 {
 	int r;
 	unsigned long n;
+	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
 	int is_dirty = 0;
 
@@ -245,7 +250,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	if (log->slot >= KVM_USER_MEM_SLOTS)
 		goto out;
 
-	memslot = id_to_memslot(kvm->memslots, log->slot);
+	slots = kvm_memslots(kvm);
+	memslot = id_to_memslot(slots, log->slot);
 	r = -ENOENT;
 	if (!memslot->dirty_bitmap)
 		goto out;
@@ -454,10 +460,10 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
 
 	mutex_lock(&kvm->lock);
 	kvm->arch.epoch = gtod - host_tod;
-	kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm) {
+	kvm_s390_vcpu_block_all(kvm);
+	kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm)
 		cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch;
-		exit_sie(cur_vcpu);
-	}
+	kvm_s390_vcpu_unblock_all(kvm);
 	mutex_unlock(&kvm->lock);
 	return 0;
 }
@@ -604,7 +610,7 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr)
 		goto out;
 	}
 	get_cpu_id((struct cpuid *) &mach->cpuid);
-	mach->ibc = sclp_get_ibc();
+	mach->ibc = sclp.ibc;
 	memcpy(&mach->fac_mask, kvm->arch.model.fac->mask,
 	       S390_ARCH_FAC_LIST_SIZE_BYTE);
 	memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list,
@@ -1068,7 +1074,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	       S390_ARCH_FAC_LIST_SIZE_BYTE);
 
 	kvm_s390_get_cpu_id(&kvm->arch.model.cpu_id);
-	kvm->arch.model.ibc = sclp_get_ibc() & 0x0fff;
+	kvm->arch.model.ibc = sclp.ibc & 0x0fff;
 
 	if (kvm_s390_crypto_init(kvm) < 0)
 		goto out_err;
@@ -1311,8 +1317,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 	atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
 						    CPUSTAT_SM |
-						    CPUSTAT_STOPPED |
-						    CPUSTAT_GED);
+						    CPUSTAT_STOPPED);
+
+	if (test_kvm_facility(vcpu->kvm, 78))
+		atomic_set_mask(CPUSTAT_GED2, &vcpu->arch.sie_block->cpuflags);
+	else if (test_kvm_facility(vcpu->kvm, 8))
+		atomic_set_mask(CPUSTAT_GED, &vcpu->arch.sie_block->cpuflags);
+
 	kvm_s390_vcpu_setup_model(vcpu);
 
 	vcpu->arch.sie_block->ecb   = 6;
@@ -1321,9 +1332,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.sie_block->ecb2  = 8;
 	vcpu->arch.sie_block->eca   = 0xC1002000U;
-	if (sclp_has_siif())
+	if (sclp.has_siif)
 		vcpu->arch.sie_block->eca |= 1;
-	if (sclp_has_sigpif())
+	if (sclp.has_sigpif)
 		vcpu->arch.sie_block->eca |= 0x10000000U;
 	if (test_kvm_facility(vcpu->kvm, 129)) {
 		vcpu->arch.sie_block->eca |= 0x00020000;
@@ -1409,16 +1420,28 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 	return kvm_s390_vcpu_has_irq(vcpu, 0);
 }
 
-void s390_vcpu_block(struct kvm_vcpu *vcpu)
+void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu)
 {
 	atomic_set_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
+	exit_sie(vcpu);
 }
 
-void s390_vcpu_unblock(struct kvm_vcpu *vcpu)
+void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu)
 {
 	atomic_clear_mask(PROG_BLOCK_SIE, &vcpu->arch.sie_block->prog20);
 }
 
+static void kvm_s390_vcpu_request(struct kvm_vcpu *vcpu)
+{
+	atomic_set_mask(PROG_REQUEST, &vcpu->arch.sie_block->prog20);
+	exit_sie(vcpu);
+}
+
+static void kvm_s390_vcpu_request_handled(struct kvm_vcpu *vcpu)
+{
+	atomic_clear_mask(PROG_REQUEST, &vcpu->arch.sie_block->prog20);
+}
+
 /*
  * Kick a guest cpu out of SIE and wait until SIE is not running.
  * If the CPU is not running (e.g. waiting as idle) the function will
@@ -1430,11 +1453,11 @@ void exit_sie(struct kvm_vcpu *vcpu)
 		cpu_relax();
 }
 
-/* Kick a guest cpu out of SIE and prevent SIE-reentry */
-void exit_sie_sync(struct kvm_vcpu *vcpu)
+/* Kick a guest cpu out of SIE to process a request synchronously */
+void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
 {
-	s390_vcpu_block(vcpu);
-	exit_sie(vcpu);
+	kvm_make_request(req, vcpu);
+	kvm_s390_vcpu_request(vcpu);
 }
 
 static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
@@ -1447,8 +1470,7 @@ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
 		/* match against both prefix pages */
 		if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
 			VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
-			kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
-			exit_sie_sync(vcpu);
+			kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
 		}
 	}
 }
@@ -1720,8 +1742,10 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu)
 
 static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu)
 {
+	if (!vcpu->requests)
+		return 0;
 retry:
-	s390_vcpu_unblock(vcpu);
+	kvm_s390_vcpu_request_handled(vcpu);
 	/*
 	 * We use MMU_RELOAD just to re-arm the ipte notifier for the
 	 * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
@@ -1993,12 +2017,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 		 * As PF_VCPU will be used in fault handler, between
 		 * guest_enter and guest_exit should be no uaccess.
 		 */
-		preempt_disable();
-		kvm_guest_enter();
-		preempt_enable();
+		local_irq_disable();
+		__kvm_guest_enter();
+		local_irq_enable();
 		exit_reason = sie64a(vcpu->arch.sie_block,
 				     vcpu->run->s.regs.gprs);
-		kvm_guest_exit();
+		local_irq_disable();
+		__kvm_guest_exit();
+		local_irq_enable();
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 
 		rc = vcpu_post_run(vcpu, exit_reason);
@@ -2068,7 +2094,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) {
 		kvm_s390_vcpu_start(vcpu);
 	} else if (is_vcpu_stopped(vcpu)) {
-		pr_err_ratelimited("kvm-s390: can't run stopped vcpu %d\n",
+		pr_err_ratelimited("can't run stopped vcpu %d\n",
 				   vcpu->vcpu_id);
 		return -EINVAL;
 	}
@@ -2206,8 +2232,7 @@ int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr)
 static void __disable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
 {
 	kvm_check_request(KVM_REQ_ENABLE_IBS, vcpu);
-	kvm_make_request(KVM_REQ_DISABLE_IBS, vcpu);
-	exit_sie_sync(vcpu);
+	kvm_s390_sync_request(KVM_REQ_DISABLE_IBS, vcpu);
 }
 
 static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
@@ -2223,8 +2248,7 @@ static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
 static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
 {
 	kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu);
-	kvm_make_request(KVM_REQ_ENABLE_IBS, vcpu);
-	exit_sie_sync(vcpu);
+	kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
 }
 
 void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu)
@@ -2563,7 +2587,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 /* Section: memory related */
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
-				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_userspace_memory_region *mem,
 				   enum kvm_mr_change change)
 {
 	/* A few sanity checks. We can have memory slots which have to be
@@ -2581,8 +2605,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem,
+				const struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new,
 				enum kvm_mr_change change)
 {
 	int rc;
@@ -2601,7 +2626,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr,
 		mem->guest_phys_addr, mem->memory_size);
 	if (rc)
-		printk(KERN_WARNING "kvm-s390: failed to commit memory region\n");
+		pr_warn("failed to commit memory region\n");
 	return;
 }
 
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index ca108b90ae56..c5704786e473 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -211,10 +211,10 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr);
 int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu);
-void s390_vcpu_block(struct kvm_vcpu *vcpu);
-void s390_vcpu_unblock(struct kvm_vcpu *vcpu);
+void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu);
+void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu);
 void exit_sie(struct kvm_vcpu *vcpu);
-void exit_sie_sync(struct kvm_vcpu *vcpu);
+void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu);
 int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
 /* is cmma enabled */
@@ -228,6 +228,25 @@ int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
 int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
 			     struct kvm_s390_pgm_info *pgm_info);
 
+static inline void kvm_s390_vcpu_block_all(struct kvm *kvm)
+{
+	int i;
+	struct kvm_vcpu *vcpu;
+
+	WARN_ON(!mutex_is_locked(&kvm->lock));
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		kvm_s390_vcpu_block(vcpu);
+}
+
+static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm)
+{
+	int i;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		kvm_s390_vcpu_unblock(vcpu);
+}
+
 /**
  * kvm_s390_inject_prog_cond - conditionally inject a program check
  * @vcpu: virtual cpu
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index d22d8ee1ff9d..ad4242245771 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -698,10 +698,14 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 	case 0x00001000:
 		end = (start + (1UL << 20)) & ~((1UL << 20) - 1);
 		break;
-	/* We dont support EDAT2
 	case 0x00002000:
+		/* only support 2G frame size if EDAT2 is available and we are
+		   not in 24-bit addressing mode */
+		if (!test_kvm_facility(vcpu->kvm, 78) ||
+		    psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_24BIT)
+			return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 		end = (start + (1UL << 31)) & ~((1UL << 31) - 1);
-		break;*/
+		break;
 	default:
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 	}
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 76515bcea2f1..4c8f5d7f9c23 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -399,7 +399,7 @@ static inline int do_exception(struct pt_regs *regs, int access)
 	 * user context.
 	 */
 	fault = VM_FAULT_BADCONTEXT;
-	if (unlikely(!user_space_fault(regs) || in_atomic() || !mm))
+	if (unlikely(!user_space_fault(regs) || faulthandler_disabled() || !mm))
 		goto out;
 
 	address = trans_exc_code & __FAIL_ADDR_MASK;
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 80875c43a4a4..76e873748b56 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -213,7 +213,7 @@ unsigned long memory_block_size_bytes(void)
 	 * Make sure the memory block size is always greater
 	 * or equal than the memory increment size.
 	 */
-	return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp_get_rzm());
+	return max_t(unsigned long, MIN_MEMORY_BLOCK_SIZE, sclp.rzm);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/arch/s390/mm/mem_detect.c b/arch/s390/mm/mem_detect.c
index 0f3604395805..e00f0d5d296d 100644
--- a/arch/s390/mm/mem_detect.c
+++ b/arch/s390/mm/mem_detect.c
@@ -31,8 +31,8 @@ void __init detect_memory_memblock(void)
 	unsigned long addr, size;
 	int type;
 
-	rzm = sclp_get_rzm();
-	rnmax = sclp_get_rnmax();
+	rzm = sclp.rzm;
+	rnmax = sclp.rnmax;
 	memsize = rzm * rnmax;
 	if (!rzm)
 		rzm = 1ULL << 17;
diff --git a/arch/s390/net/bpf_jit.h b/arch/s390/net/bpf_jit.h
index ba8593a515ba..f6498eec9ee1 100644
--- a/arch/s390/net/bpf_jit.h
+++ b/arch/s390/net/bpf_jit.h
@@ -28,6 +28,9 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
  *	      | old backchain |     |
  *	      +---------------+     |
  *	      |   r15 - r6    |     |
+ *	      +---------------+     |
+ *	      | 4 byte align  |     |
+ *	      | tail_call_cnt |     |
  * BFP	   -> +===============+     |
  *	      |		      |     |
  *	      |   BPF stack   |     |
@@ -46,12 +49,17 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[];
  * R15	   -> +---------------+     + low
  *
  * We get 160 bytes stack space from calling function, but only use
- * 11 * 8 byte (old backchain + r15 - r6) for storing registers.
+ * 12 * 8 byte for old backchain, r15..r6, and tail_call_cnt.
  */
-#define STK_OFF (MAX_BPF_STACK + 8 + 4 + 4 + (160 - 11 * 8))
+#define STK_SPACE	(MAX_BPF_STACK + 8 + 4 + 4 + 160)
+#define STK_160_UNUSED	(160 - 12 * 8)
+#define STK_OFF		(STK_SPACE - STK_160_UNUSED)
 #define STK_OFF_TMP	160	/* Offset of tmp buffer on stack */
 #define STK_OFF_HLEN	168	/* Offset of SKB header length on stack */
 
+#define STK_OFF_R6	(160 - 11 * 8)	/* Offset of r6 on stack */
+#define STK_OFF_TCCNT	(160 - 12 * 8)	/* Offset of tail_call_cnt on stack */
+
 /* Offset to skip condition code check */
 #define OFF_OK		4
 
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 7690dc8e1ab5..d3766dd67e23 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -21,6 +21,7 @@
 #include <linux/netdevice.h>
 #include <linux/filter.h>
 #include <linux/init.h>
+#include <linux/bpf.h>
 #include <asm/cacheflush.h>
 #include <asm/dis.h>
 #include "bpf_jit.h"
@@ -40,6 +41,8 @@ struct bpf_jit {
 	int base_ip;		/* Base address for literal pool */
 	int ret0_ip;		/* Address of return 0 */
 	int exit_ip;		/* Address of exit */
+	int tail_call_start;	/* Tail call start offset */
+	int labels[1];		/* Labels for local jumps */
 };
 
 #define BPF_SIZE_MAX	4096	/* Max size for program */
@@ -49,6 +52,7 @@ struct bpf_jit {
 #define SEEN_RET0	4	/* ret0_ip points to a valid return 0 */
 #define SEEN_LITERAL	8	/* code uses literals */
 #define SEEN_FUNC	16	/* calls C functions */
+#define SEEN_TAIL_CALL	32	/* code uses tail calls */
 #define SEEN_STACK	(SEEN_FUNC | SEEN_MEM | SEEN_SKB)
 
 /*
@@ -60,6 +64,7 @@ struct bpf_jit {
 #define REG_L		(__MAX_BPF_REG+3)	/* Literal pool register */
 #define REG_15		(__MAX_BPF_REG+4)	/* Register 15 */
 #define REG_0		REG_W0			/* Register 0 */
+#define REG_1		REG_W1			/* Register 1 */
 #define REG_2		BPF_REG_1		/* Register 2 */
 #define REG_14		BPF_REG_0		/* Register 14 */
 
@@ -223,6 +228,24 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1)
 	REG_SET_SEEN(b3);					\
 })
 
+#define EMIT6_PCREL_LABEL(op1, op2, b1, b2, label, mask)	\
+({								\
+	int rel = (jit->labels[label] - jit->prg) >> 1;		\
+	_EMIT6(op1 | reg(b1, b2) << 16 | (rel & 0xffff),	\
+	       op2 | mask << 12);				\
+	REG_SET_SEEN(b1);					\
+	REG_SET_SEEN(b2);					\
+})
+
+#define EMIT6_PCREL_IMM_LABEL(op1, op2, b1, imm, label, mask)	\
+({								\
+	int rel = (jit->labels[label] - jit->prg) >> 1;		\
+	_EMIT6(op1 | (reg_high(b1) | mask) << 16 |		\
+		(rel & 0xffff), op2 | (imm & 0xff) << 8);	\
+	REG_SET_SEEN(b1);					\
+	BUILD_BUG_ON(((unsigned long) imm) > 0xff);		\
+})
+
 #define EMIT6_PCREL(op1, op2, b1, b2, i, off, mask)		\
 ({								\
 	/* Branch instruction needs 6 bytes */			\
@@ -286,7 +309,7 @@ static void jit_fill_hole(void *area, unsigned int size)
  */
 static void save_regs(struct bpf_jit *jit, u32 rs, u32 re)
 {
-	u32 off = 72 + (rs - 6) * 8;
+	u32 off = STK_OFF_R6 + (rs - 6) * 8;
 
 	if (rs == re)
 		/* stg %rs,off(%r15) */
@@ -301,7 +324,7 @@ static void save_regs(struct bpf_jit *jit, u32 rs, u32 re)
  */
 static void restore_regs(struct bpf_jit *jit, u32 rs, u32 re)
 {
-	u32 off = 72 + (rs - 6) * 8;
+	u32 off = STK_OFF_R6 + (rs - 6) * 8;
 
 	if (jit->seen & SEEN_STACK)
 		off += STK_OFF;
@@ -374,6 +397,16 @@ static void save_restore_regs(struct bpf_jit *jit, int op)
  */
 static void bpf_jit_prologue(struct bpf_jit *jit)
 {
+	if (jit->seen & SEEN_TAIL_CALL) {
+		/* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */
+		_EMIT6(0xd703f000 | STK_OFF_TCCNT, 0xf000 | STK_OFF_TCCNT);
+	} else {
+		/* j tail_call_start: NOP if no tail calls are used */
+		EMIT4_PCREL(0xa7f40000, 6);
+		_EMIT2(0);
+	}
+	/* Tail calls have to skip above initialization */
+	jit->tail_call_start = jit->prg;
 	/* Save registers */
 	save_restore_regs(jit, REGS_SAVE);
 	/* Setup literal pool */
@@ -384,13 +417,16 @@ static void bpf_jit_prologue(struct bpf_jit *jit)
 	}
 	/* Setup stack and backchain */
 	if (jit->seen & SEEN_STACK) {
-		/* lgr %bfp,%r15 (BPF frame pointer) */
-		EMIT4(0xb9040000, BPF_REG_FP, REG_15);
+		if (jit->seen & SEEN_FUNC)
+			/* lgr %w1,%r15 (backchain) */
+			EMIT4(0xb9040000, REG_W1, REG_15);
+		/* la %bfp,STK_160_UNUSED(%r15) (BPF frame pointer) */
+		EMIT4_DISP(0x41000000, BPF_REG_FP, REG_15, STK_160_UNUSED);
 		/* aghi %r15,-STK_OFF */
 		EMIT4_IMM(0xa70b0000, REG_15, -STK_OFF);
 		if (jit->seen & SEEN_FUNC)
-			/* stg %bfp,152(%r15) (backchain) */
-			EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_FP, REG_0,
+			/* stg %w1,152(%r15) (backchain) */
+			EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,
 				      REG_15, 152);
 	}
 	/*
@@ -443,8 +479,11 @@ static void bpf_jit_epilogue(struct bpf_jit *jit)
 
 /*
  * Compile one eBPF instruction into s390x code
+ *
+ * NOTE: Use noinline because for gcov (-fprofile-arcs) gcc allocates a lot of
+ * stack space for the large switch statement.
  */
-static int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i)
+static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i)
 {
 	struct bpf_insn *insn = &fp->insnsi[i];
 	int jmp_off, last, insn_count = 1;
@@ -588,8 +627,8 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i)
 		EMIT4(0xb9160000, dst_reg, rc_reg);
 		break;
 	}
-	case BPF_ALU64 | BPF_DIV | BPF_X: /* dst = dst / (u32) src */
-	case BPF_ALU64 | BPF_MOD | BPF_X: /* dst = dst % (u32) src */
+	case BPF_ALU64 | BPF_DIV | BPF_X: /* dst = dst / src */
+	case BPF_ALU64 | BPF_MOD | BPF_X: /* dst = dst % src */
 	{
 		int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0;
 
@@ -602,10 +641,8 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i)
 		EMIT4_IMM(0xa7090000, REG_W0, 0);
 		/* lgr %w1,%dst */
 		EMIT4(0xb9040000, REG_W1, dst_reg);
-		/* llgfr %dst,%src (u32 cast) */
-		EMIT4(0xb9160000, dst_reg, src_reg);
 		/* dlgr %w0,%dst */
-		EMIT4(0xb9870000, REG_W0, dst_reg);
+		EMIT4(0xb9870000, REG_W0, src_reg);
 		/* lgr %dst,%rc */
 		EMIT4(0xb9040000, dst_reg, rc_reg);
 		break;
@@ -632,8 +669,8 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i)
 		EMIT4(0xb9160000, dst_reg, rc_reg);
 		break;
 	}
-	case BPF_ALU64 | BPF_DIV | BPF_K: /* dst = dst / (u32) imm */
-	case BPF_ALU64 | BPF_MOD | BPF_K: /* dst = dst % (u32) imm */
+	case BPF_ALU64 | BPF_DIV | BPF_K: /* dst = dst / imm */
+	case BPF_ALU64 | BPF_MOD | BPF_K: /* dst = dst % imm */
 	{
 		int rc_reg = BPF_OP(insn->code) == BPF_DIV ? REG_W1 : REG_W0;
 
@@ -649,7 +686,7 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i)
 		EMIT4(0xb9040000, REG_W1, dst_reg);
 		/* dlg %w0,<d(imm)>(%l) */
 		EMIT6_DISP_LH(0xe3000000, 0x0087, REG_W0, REG_0, REG_L,
-			      EMIT_CONST_U64((u32) imm));
+			      EMIT_CONST_U64(imm));
 		/* lgr %dst,%rc */
 		EMIT4(0xb9040000, dst_reg, rc_reg);
 		break;
@@ -947,6 +984,75 @@ static int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i)
 		EMIT4(0xb9040000, BPF_REG_0, REG_2);
 		break;
 	}
+	case BPF_JMP | BPF_CALL | BPF_X:
+		/*
+		 * Implicit input:
+		 *  B1: pointer to ctx
+		 *  B2: pointer to bpf_array
+		 *  B3: index in bpf_array
+		 */
+		jit->seen |= SEEN_TAIL_CALL;
+
+		/*
+		 * if (index >= array->map.max_entries)
+		 *         goto out;
+		 */
+
+		/* llgf %w1,map.max_entries(%b2) */
+		EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_2,
+			      offsetof(struct bpf_array, map.max_entries));
+		/* clgrj %b3,%w1,0xa,label0: if %b3 >= %w1 goto out */
+		EMIT6_PCREL_LABEL(0xec000000, 0x0065, BPF_REG_3,
+				  REG_W1, 0, 0xa);
+
+		/*
+		 * if (tail_call_cnt++ > MAX_TAIL_CALL_CNT)
+		 *         goto out;
+		 */
+
+		if (jit->seen & SEEN_STACK)
+			off = STK_OFF_TCCNT + STK_OFF;
+		else
+			off = STK_OFF_TCCNT;
+		/* lhi %w0,1 */
+		EMIT4_IMM(0xa7080000, REG_W0, 1);
+		/* laal %w1,%w0,off(%r15) */
+		EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off);
+		/* clij %w1,MAX_TAIL_CALL_CNT,0x2,label0 */
+		EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007f, REG_W1,
+				      MAX_TAIL_CALL_CNT, 0, 0x2);
+
+		/*
+		 * prog = array->prog[index];
+		 * if (prog == NULL)
+		 *         goto out;
+		 */
+
+		/* sllg %r1,%b3,3: %r1 = index * 8 */
+		EMIT6_DISP_LH(0xeb000000, 0x000d, REG_1, BPF_REG_3, REG_0, 3);
+		/* lg %r1,prog(%b2,%r1) */
+		EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, BPF_REG_2,
+			      REG_1, offsetof(struct bpf_array, prog));
+		/* clgij %r1,0,0x8,label0 */
+		EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007d, REG_1, 0, 0, 0x8);
+
+		/*
+		 * Restore registers before calling function
+		 */
+		save_restore_regs(jit, REGS_RESTORE);
+
+		/*
+		 * goto *(prog->bpf_func + tail_call_start);
+		 */
+
+		/* lg %r1,bpf_func(%r1) */
+		EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, REG_1, REG_0,
+			      offsetof(struct bpf_prog, bpf_func));
+		/* bc 0xf,tail_call_start(%r1) */
+		_EMIT4(0x47f01000 + jit->tail_call_start);
+		/* out: */
+		jit->labels[0] = jit->prg;
+		break;
 	case BPF_JMP | BPF_EXIT: /* return b0 */
 		last = (i == fp->len - 1) ? 1 : 0;
 		if (last && !(jit->seen & SEEN_RET0))
diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c
index 460fdb21cf61..ed2394dd14e9 100644
--- a/arch/s390/pci/pci_event.c
+++ b/arch/s390/pci/pci_event.c
@@ -75,7 +75,13 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
 	zpci_err_hex(ccdf, sizeof(*ccdf));
 
 	switch (ccdf->pec) {
-	case 0x0301: /* Standby -> Configured */
+	case 0x0301: /* Reserved|Standby -> Configured */
+		if (!zdev) {
+			ret = clp_add_pci_device(ccdf->fid, ccdf->fh, 0);
+			if (ret)
+				break;
+			zdev = get_zdev_by_fid(ccdf->fid);
+		}
 		if (!zdev || zdev->state != ZPCI_FN_STATE_STANDBY)
 			break;
 		zdev->state = ZPCI_FN_STATE_CONFIGURED;
diff --git a/arch/score/include/asm/cmpxchg.h b/arch/score/include/asm/cmpxchg.h
index f384839c3ee5..cc3f6420b71c 100644
--- a/arch/score/include/asm/cmpxchg.h
+++ b/arch/score/include/asm/cmpxchg.h
@@ -42,8 +42,6 @@ static inline unsigned long __cmpxchg(volatile unsigned long *m,
 					(unsigned long)(o),	\
 					(unsigned long)(n)))
 
-#define __HAVE_ARCH_CMPXCHG	1
-
 #include <asm-generic/cmpxchg-local.h>
 
 #endif /* _ASM_SCORE_CMPXCHG_H */
diff --git a/arch/score/include/asm/uaccess.h b/arch/score/include/asm/uaccess.h
index ab66ddde777b..20a3591225cc 100644
--- a/arch/score/include/asm/uaccess.h
+++ b/arch/score/include/asm/uaccess.h
@@ -36,7 +36,8 @@
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Checks if a pointer to a block of memory in user space is valid.
  *
@@ -61,7 +62,8 @@
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -79,7 +81,8 @@
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -98,7 +101,8 @@
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -119,7 +123,8 @@
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
diff --git a/arch/score/lib/string.S b/arch/score/lib/string.S
index 00b7d3a2fc60..16efa3ad037f 100644
--- a/arch/score/lib/string.S
+++ b/arch/score/lib/string.S
@@ -175,10 +175,10 @@ ENTRY(__clear_user)
 	br	r3
 
 	.section .fixup, "ax"
+99:
 	br	r3
 	.previous
 	.section __ex_table, "a"
 	.align	2
-99:
 	.word	0b, 99b
 	.previous
diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c
index 6860beb2a280..37a6c2e0e969 100644
--- a/arch/score/mm/fault.c
+++ b/arch/score/mm/fault.c
@@ -34,6 +34,7 @@
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/ptrace.h>
+#include <linux/uaccess.h>
 
 /*
  * This routine handles page faults.  It determines the address,
@@ -73,7 +74,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
 	* If we're in an interrupt or have no user
 	* context, we must not take the fault..
 	*/
-	if (in_atomic() || !mm)
+	if (pagefault_disabled() || !mm)
 		goto bad_area_nosemaphore;
 
 	if (user_mode(regs))
diff --git a/arch/sh/drivers/pci/ops-sh5.c b/arch/sh/drivers/pci/ops-sh5.c
index 4ce95a001b80..45361946460f 100644
--- a/arch/sh/drivers/pci/ops-sh5.c
+++ b/arch/sh/drivers/pci/ops-sh5.c
@@ -18,7 +18,6 @@
 #include <linux/delay.h>
 #include <linux/types.h>
 #include <linux/irq.h>
-#include <asm/pci.h>
 #include <asm/io.h>
 #include "pci-sh5.h"
 
diff --git a/arch/sh/drivers/pci/pci-sh5.c b/arch/sh/drivers/pci/pci-sh5.c
index 16c1e721bf54..8229114c6a58 100644
--- a/arch/sh/drivers/pci/pci-sh5.c
+++ b/arch/sh/drivers/pci/pci-sh5.c
@@ -20,7 +20,6 @@
 #include <linux/types.h>
 #include <linux/irq.h>
 #include <cpu/irq.h>
-#include <asm/pci.h>
 #include <asm/io.h>
 #include "pci-sh5.h"
 
diff --git a/arch/sh/include/asm/barrier.h b/arch/sh/include/asm/barrier.h
index 43715308b068..bf91037db4e0 100644
--- a/arch/sh/include/asm/barrier.h
+++ b/arch/sh/include/asm/barrier.h
@@ -32,7 +32,7 @@
 #define ctrl_barrier()	__asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop")
 #endif
 
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
 
 #include <asm-generic/barrier.h>
 
diff --git a/arch/sh/include/asm/cmpxchg.h b/arch/sh/include/asm/cmpxchg.h
index f6bd1406b897..85c97b188d71 100644
--- a/arch/sh/include/asm/cmpxchg.h
+++ b/arch/sh/include/asm/cmpxchg.h
@@ -46,8 +46,6 @@ extern void __xchg_called_with_bad_pointer(void);
  * if something tries to do an invalid cmpxchg(). */
 extern void __cmpxchg_called_with_bad_pointer(void);
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old,
 		unsigned long new, int size)
 {
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 5b4511552998..e343dbd02e41 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -86,24 +86,6 @@ extern void pcibios_set_master(struct pci_dev *dev);
  * direct memory write.
  */
 #define PCI_DISABLE_MWI
-
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	unsigned long cacheline_size;
-	u8 byte;
-
-	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
-
-	if (byte == 0)
-		cacheline_size = L1_CACHE_BYTES;
-	else
-		cacheline_size = byte << 2;
-
-	*strat = PCI_DMA_BURST_MULTIPLE;
-	*strategy_parameter = cacheline_size;
-}
 #endif
 
 /* Board-specific fixup routines. */
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7734.c b/arch/sh/kernel/cpu/sh4a/clock-sh7734.c
index 1fdf1ee672de..7f54bf2f453d 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7734.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7734.c
@@ -246,8 +246,7 @@ int __init arch_clk_init(void)
 	for (i = 0; i < ARRAY_SIZE(main_clks); i++)
 		ret |= clk_register(main_clks[i]);
 
-	for (i = 0; i < ARRAY_SIZE(lookups); i++)
-		clkdev_add(&lookups[i]);
+	clkdev_add_table(lookups, ARRAY_SIZE(lookups));
 
 	if (!ret)
 		ret = sh_clk_div4_register(div4_clks, ARRAY_SIZE(div4_clks),
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7757.c b/arch/sh/kernel/cpu/sh4a/clock-sh7757.c
index 9a28fdb36387..e40ec2c97ad1 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7757.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7757.c
@@ -141,8 +141,8 @@ int __init arch_clk_init(void)
 
 	for (i = 0; i < ARRAY_SIZE(clks); i++)
 		ret |= clk_register(clks[i]);
-	for (i = 0; i < ARRAY_SIZE(lookups); i++)
-		clkdev_add(&lookups[i]);
+
+	clkdev_add_table(lookups, ARRAY_SIZE(lookups));
 
 	if (!ret)
 		ret = sh_clk_div4_register(div4_clks, ARRAY_SIZE(div4_clks),
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index 17d0ea55a5a2..8eb6e62340c9 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -164,8 +164,8 @@ int __init arch_clk_init(void)
 
 	for (i = 0; i < ARRAY_SIZE(clks); i++)
 		ret |= clk_register(clks[i]);
-	for (i = 0; i < ARRAY_SIZE(lookups); i++)
-		clkdev_add(&lookups[i]);
+
+	clkdev_add_table(lookups, ARRAY_SIZE(lookups));
 
 	if (!ret)
 		ret = sh_clk_div4_register(div4_clks, ARRAY_SIZE(div4_clks),
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
index bec2a83f1ba5..5e50e7ebeff0 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
@@ -179,8 +179,8 @@ int __init arch_clk_init(void)
 
 	for (i = 0; i < ARRAY_SIZE(clks); i++)
 		ret |= clk_register(clks[i]);
-	for (i = 0; i < ARRAY_SIZE(lookups); i++)
-		clkdev_add(&lookups[i]);
+
+	clkdev_add_table(lookups, ARRAY_SIZE(lookups));
 
 	if (!ret)
 		ret = sh_clk_div4_register(div4_clks, ARRAY_SIZE(div4_clks),
diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
index 9a49a44f6f94..605221d1448a 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
@@ -138,8 +138,8 @@ int __init arch_clk_init(void)
 
 	for (i = 0; i < ARRAY_SIZE(clks); i++)
 		ret |= clk_register(clks[i]);
-	for (i = 0; i < ARRAY_SIZE(lookups); i++)
-		clkdev_add(&lookups[i]);
+
+	clkdev_add_table(lookups, ARRAY_SIZE(lookups));
 
 	if (!ret)
 		ret = sh_clk_div4_register(div4_clks, ARRAY_SIZE(div4_clks),
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index a58fec9b55e0..79d8276377d1 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -17,6 +17,7 @@
 #include <linux/kprobes.h>
 #include <linux/perf_event.h>
 #include <linux/kdebug.h>
+#include <linux/uaccess.h>
 #include <asm/io_trapped.h>
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
@@ -438,9 +439,9 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
-	 * in an atomic region then we must not take the fault:
+	 * with pagefaults disabled then we must not take the fault:
 	 */
-	if (unlikely(in_atomic() || !mm)) {
+	if (unlikely(faulthandler_disabled() || !mm)) {
 		bad_area_nosemaphore(regs, error_code, address);
 		return;
 	}
diff --git a/arch/sparc/crypto/md5_glue.c b/arch/sparc/crypto/md5_glue.c
index b688731d7ede..c9d2b922734b 100644
--- a/arch/sparc/crypto/md5_glue.c
+++ b/arch/sparc/crypto/md5_glue.c
@@ -33,10 +33,10 @@ static int md5_sparc64_init(struct shash_desc *desc)
 {
 	struct md5_state *mctx = shash_desc_ctx(desc);
 
-	mctx->hash[0] = cpu_to_le32(0x67452301);
-	mctx->hash[1] = cpu_to_le32(0xefcdab89);
-	mctx->hash[2] = cpu_to_le32(0x98badcfe);
-	mctx->hash[3] = cpu_to_le32(0x10325476);
+	mctx->hash[0] = cpu_to_le32(MD5_H0);
+	mctx->hash[1] = cpu_to_le32(MD5_H1);
+	mctx->hash[2] = cpu_to_le32(MD5_H2);
+	mctx->hash[3] = cpu_to_le32(MD5_H3);
 	mctx->byte_count = 0;
 
 	return 0;
diff --git a/arch/sparc/include/asm/barrier_64.h b/arch/sparc/include/asm/barrier_64.h
index 76648941fea7..809941e33e12 100644
--- a/arch/sparc/include/asm/barrier_64.h
+++ b/arch/sparc/include/asm/barrier_64.h
@@ -40,8 +40,8 @@ do {	__asm__ __volatile__("ba,pt	%%xcc, 1f\n\t" \
 #define dma_rmb()	rmb()
 #define dma_wmb()	wmb()
 
-#define set_mb(__var, __value) \
-	do { __var = __value; membar_safe("#StoreLoad"); } while(0)
+#define smp_store_mb(__var, __value) \
+	do { WRITE_ONCE(__var, __value); membar_safe("#StoreLoad"); } while(0)
 
 #ifdef CONFIG_SMP
 #define smp_mb()	mb()
diff --git a/arch/sparc/include/asm/cmpxchg_32.h b/arch/sparc/include/asm/cmpxchg_32.h
index d38b52dca216..83ffb83c5397 100644
--- a/arch/sparc/include/asm/cmpxchg_32.h
+++ b/arch/sparc/include/asm/cmpxchg_32.h
@@ -34,7 +34,6 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr, int
  *
  * Cribbed from <asm-parisc/atomic.h>
  */
-#define __HAVE_ARCH_CMPXCHG	1
 
 /* bug catcher for when unsupported size is used - won't link */
 void __cmpxchg_called_with_bad_pointer(void);
diff --git a/arch/sparc/include/asm/cmpxchg_64.h b/arch/sparc/include/asm/cmpxchg_64.h
index 0e1ed6cfbf68..faa2f61058c2 100644
--- a/arch/sparc/include/asm/cmpxchg_64.h
+++ b/arch/sparc/include/asm/cmpxchg_64.h
@@ -65,8 +65,6 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr,
 
 #include <asm-generic/cmpxchg-local.h>
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 static inline unsigned long
 __cmpxchg_u32(volatile int *m, int old, int new)
 {
diff --git a/arch/sparc/include/asm/cpudata_64.h b/arch/sparc/include/asm/cpudata_64.h
index a6e424d185d0..a6cfdabb6054 100644
--- a/arch/sparc/include/asm/cpudata_64.h
+++ b/arch/sparc/include/asm/cpudata_64.h
@@ -24,7 +24,8 @@ typedef struct {
 	unsigned int	icache_line_size;
 	unsigned int	ecache_size;
 	unsigned int	ecache_line_size;
-	int		core_id;
+	unsigned short	sock_id;
+	unsigned short	core_id;
 	int		proc_id;
 } cpuinfo_sparc;
 
diff --git a/arch/sparc/include/asm/io_32.h b/arch/sparc/include/asm/io_32.h
index 407ac14295f4..57f26c398dc9 100644
--- a/arch/sparc/include/asm/io_32.h
+++ b/arch/sparc/include/asm/io_32.h
@@ -129,6 +129,7 @@ static inline void sbus_memcpy_toio(volatile void __iomem *dst,
 void __iomem *ioremap(unsigned long offset, unsigned long size);
 #define ioremap_nocache(X,Y)	ioremap((X),(Y))
 #define ioremap_wc(X,Y)		ioremap((X),(Y))
+#define ioremap_wt(X,Y)		ioremap((X),(Y))
 void iounmap(volatile void __iomem *addr);
 
 /* Create a virtual mapping cookie for an IO port range */
diff --git a/arch/sparc/include/asm/io_64.h b/arch/sparc/include/asm/io_64.h
index 50d4840d9aeb..c32fa3f752c8 100644
--- a/arch/sparc/include/asm/io_64.h
+++ b/arch/sparc/include/asm/io_64.h
@@ -402,6 +402,7 @@ static inline void __iomem *ioremap(unsigned long offset, unsigned long size)
 
 #define ioremap_nocache(X,Y)		ioremap((X),(Y))
 #define ioremap_wc(X,Y)			ioremap((X),(Y))
+#define ioremap_wt(X,Y)			ioremap((X),(Y))
 
 static inline void iounmap(volatile void __iomem *addr)
 {
diff --git a/arch/sparc/include/asm/pci_32.h b/arch/sparc/include/asm/pci_32.h
index 53e9b4987db0..b7c092df3134 100644
--- a/arch/sparc/include/asm/pci_32.h
+++ b/arch/sparc/include/asm/pci_32.h
@@ -22,16 +22,6 @@
 
 struct pci_dev;
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
-#endif
-
 #endif /* __KERNEL__ */
 
 #ifndef CONFIG_LEON_PCI
diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h
index bd00a6226169..022d16008a00 100644
--- a/arch/sparc/include/asm/pci_64.h
+++ b/arch/sparc/include/asm/pci_64.h
@@ -31,25 +31,6 @@
 #define PCI64_REQUIRED_MASK	(~(u64)0)
 #define PCI64_ADDR_BASE		0xfffc000000000000UL
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	unsigned long cacheline_size;
-	u8 byte;
-
-	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
-	if (byte == 0)
-		cacheline_size = 1024;
-	else
-		cacheline_size = (int) byte * 4;
-
-	*strat = PCI_DMA_BURST_BOUNDARY;
-	*strategy_parameter = cacheline_size;
-}
-#endif
-
 /* Return the index of the PCI controller for device PDEV. */
 
 int pci_domain_nr(struct pci_bus *bus);
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index dc165ebdf05a..2a52c91d2c8a 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -308,12 +308,26 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t prot)
 	"	sllx		%1, 32, %1\n"
 	"	or		%0, %1, %0\n"
 	"	.previous\n"
+	"	.section	.sun_m7_2insn_patch, \"ax\"\n"
+	"	.word		661b\n"
+	"	sethi		%%uhi(%4), %1\n"
+	"	sethi		%%hi(%4), %0\n"
+	"	.word		662b\n"
+	"	or		%1, %%ulo(%4), %1\n"
+	"	or		%0, %%lo(%4), %0\n"
+	"	.word		663b\n"
+	"	sllx		%1, 32, %1\n"
+	"	or		%0, %1, %0\n"
+	"	.previous\n"
 	: "=r" (mask), "=r" (tmp)
 	: "i" (_PAGE_PADDR_4U | _PAGE_MODIFIED_4U | _PAGE_ACCESSED_4U |
 	       _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_E_4U |
 	       _PAGE_SPECIAL | _PAGE_PMD_HUGE | _PAGE_SZALL_4U),
 	  "i" (_PAGE_PADDR_4V | _PAGE_MODIFIED_4V | _PAGE_ACCESSED_4V |
 	       _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_E_4V |
+	       _PAGE_SPECIAL | _PAGE_PMD_HUGE | _PAGE_SZALL_4V),
+	  "i" (_PAGE_PADDR_4V | _PAGE_MODIFIED_4V | _PAGE_ACCESSED_4V |
+	       _PAGE_CP_4V | _PAGE_E_4V |
 	       _PAGE_SPECIAL | _PAGE_PMD_HUGE | _PAGE_SZALL_4V));
 
 	return __pte((pte_val(pte) & mask) | (pgprot_val(prot) & ~mask));
@@ -342,9 +356,15 @@ static inline pgprot_t pgprot_noncached(pgprot_t prot)
 	"	andn		%0, %4, %0\n"
 	"	or		%0, %5, %0\n"
 	"	.previous\n"
+	"	.section	.sun_m7_2insn_patch, \"ax\"\n"
+	"	.word		661b\n"
+	"	andn		%0, %6, %0\n"
+	"	or		%0, %5, %0\n"
+	"	.previous\n"
 	: "=r" (val)
 	: "0" (val), "i" (_PAGE_CP_4U | _PAGE_CV_4U), "i" (_PAGE_E_4U),
-	             "i" (_PAGE_CP_4V | _PAGE_CV_4V), "i" (_PAGE_E_4V));
+	             "i" (_PAGE_CP_4V | _PAGE_CV_4V), "i" (_PAGE_E_4V),
+	             "i" (_PAGE_CP_4V));
 
 	return __pgprot(val);
 }
diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h
index ed8f071132e4..01d17046225a 100644
--- a/arch/sparc/include/asm/topology_64.h
+++ b/arch/sparc/include/asm/topology_64.h
@@ -40,11 +40,12 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 #ifdef CONFIG_SMP
 #define topology_physical_package_id(cpu)	(cpu_data(cpu).proc_id)
 #define topology_core_id(cpu)			(cpu_data(cpu).core_id)
-#define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
-#define topology_thread_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
+#define topology_core_cpumask(cpu)		(&cpu_core_sib_map[cpu])
+#define topology_sibling_cpumask(cpu)		(&per_cpu(cpu_sibling_map, cpu))
 #endif /* CONFIG_SMP */
 
 extern cpumask_t cpu_core_map[NR_CPUS];
+extern cpumask_t cpu_core_sib_map[NR_CPUS];
 static inline const struct cpumask *cpu_coregroup_mask(int cpu)
 {
         return &cpu_core_map[cpu];
diff --git a/arch/sparc/include/asm/trap_block.h b/arch/sparc/include/asm/trap_block.h
index 6fd4436d32f0..ec9c04de3664 100644
--- a/arch/sparc/include/asm/trap_block.h
+++ b/arch/sparc/include/asm/trap_block.h
@@ -79,6 +79,8 @@ struct sun4v_2insn_patch_entry {
 };
 extern struct sun4v_2insn_patch_entry __sun4v_2insn_patch,
 	__sun4v_2insn_patch_end;
+extern struct sun4v_2insn_patch_entry __sun_m7_2insn_patch,
+	__sun_m7_2insn_patch_end;
 
 
 #endif /* !(__ASSEMBLY__) */
diff --git a/arch/sparc/kernel/entry.h b/arch/sparc/kernel/entry.h
index 07cc49e541f4..0f679421b468 100644
--- a/arch/sparc/kernel/entry.h
+++ b/arch/sparc/kernel/entry.h
@@ -69,6 +69,8 @@ void sun4v_patch_1insn_range(struct sun4v_1insn_patch_entry *,
 			     struct sun4v_1insn_patch_entry *);
 void sun4v_patch_2insn_range(struct sun4v_2insn_patch_entry *,
 			     struct sun4v_2insn_patch_entry *);
+void sun_m7_patch_2insn_range(struct sun4v_2insn_patch_entry *,
+			     struct sun4v_2insn_patch_entry *);
 extern unsigned int dcache_parity_tl1_occurred;
 extern unsigned int icache_parity_tl1_occurred;
 
diff --git a/arch/sparc/kernel/leon_pci_grpci2.c b/arch/sparc/kernel/leon_pci_grpci2.c
index 94e392bdee7d..814fb1729b12 100644
--- a/arch/sparc/kernel/leon_pci_grpci2.c
+++ b/arch/sparc/kernel/leon_pci_grpci2.c
@@ -723,7 +723,6 @@ static int grpci2_of_probe(struct platform_device *ofdev)
 		err = -ENOMEM;
 		goto err1;
 	}
-	memset(grpci2priv, 0, sizeof(*grpci2priv));
 	priv->regs = regs;
 	priv->irq = ofdev->archdata.irqs[0]; /* BASE IRQ */
 	priv->irq_mode = (capability & STS_IRQMODE) >> STS_IRQMODE_BIT;
diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
index 26c80e18d7b1..6f80936e0eea 100644
--- a/arch/sparc/kernel/mdesc.c
+++ b/arch/sparc/kernel/mdesc.c
@@ -614,45 +614,68 @@ static void fill_in_one_cache(cpuinfo_sparc *c, struct mdesc_handle *hp, u64 mp)
 	}
 }
 
-static void mark_core_ids(struct mdesc_handle *hp, u64 mp, int core_id)
+static void find_back_node_value(struct mdesc_handle *hp, u64 node,
+				 char *srch_val,
+				 void (*func)(struct mdesc_handle *, u64, int),
+				 u64 val, int depth)
 {
-	u64 a;
-
-	mdesc_for_each_arc(a, hp, mp, MDESC_ARC_TYPE_BACK) {
-		u64 t = mdesc_arc_target(hp, a);
-		const char *name;
-		const u64 *id;
+	u64 arc;
 
-		name = mdesc_node_name(hp, t);
-		if (!strcmp(name, "cpu")) {
-			id = mdesc_get_property(hp, t, "id", NULL);
-			if (*id < NR_CPUS)
-				cpu_data(*id).core_id = core_id;
-		} else {
-			u64 j;
+	/* Since we have an estimate of recursion depth, do a sanity check. */
+	if (depth == 0)
+		return;
 
-			mdesc_for_each_arc(j, hp, t, MDESC_ARC_TYPE_BACK) {
-				u64 n = mdesc_arc_target(hp, j);
-				const char *n_name;
+	mdesc_for_each_arc(arc, hp, node, MDESC_ARC_TYPE_BACK) {
+		u64 n = mdesc_arc_target(hp, arc);
+		const char *name = mdesc_node_name(hp, n);
 
-				n_name = mdesc_node_name(hp, n);
-				if (strcmp(n_name, "cpu"))
-					continue;
+		if (!strcmp(srch_val, name))
+			(*func)(hp, n, val);
 
-				id = mdesc_get_property(hp, n, "id", NULL);
-				if (*id < NR_CPUS)
-					cpu_data(*id).core_id = core_id;
-			}
-		}
+		find_back_node_value(hp, n, srch_val, func, val, depth-1);
 	}
 }
 
+static void __mark_core_id(struct mdesc_handle *hp, u64 node,
+			   int core_id)
+{
+	const u64 *id = mdesc_get_property(hp, node, "id", NULL);
+
+	if (*id < num_possible_cpus())
+		cpu_data(*id).core_id = core_id;
+}
+
+static void __mark_sock_id(struct mdesc_handle *hp, u64 node,
+			   int sock_id)
+{
+	const u64 *id = mdesc_get_property(hp, node, "id", NULL);
+
+	if (*id < num_possible_cpus())
+		cpu_data(*id).sock_id = sock_id;
+}
+
+static void mark_core_ids(struct mdesc_handle *hp, u64 mp,
+			  int core_id)
+{
+	find_back_node_value(hp, mp, "cpu", __mark_core_id, core_id, 10);
+}
+
+static void mark_sock_ids(struct mdesc_handle *hp, u64 mp,
+			  int sock_id)
+{
+	find_back_node_value(hp, mp, "cpu", __mark_sock_id, sock_id, 10);
+}
+
 static void set_core_ids(struct mdesc_handle *hp)
 {
 	int idx;
 	u64 mp;
 
 	idx = 1;
+
+	/* Identify unique cores by looking for cpus backpointed to by
+	 * level 1 instruction caches.
+	 */
 	mdesc_for_each_node_by_name(hp, mp, "cache") {
 		const u64 *level;
 		const char *type;
@@ -667,11 +690,72 @@ static void set_core_ids(struct mdesc_handle *hp)
 			continue;
 
 		mark_core_ids(hp, mp, idx);
+		idx++;
+	}
+}
+
+static int set_sock_ids_by_cache(struct mdesc_handle *hp, int level)
+{
+	u64 mp;
+	int idx = 1;
+	int fnd = 0;
+
+	/* Identify unique sockets by looking for cpus backpointed to by
+	 * shared level n caches.
+	 */
+	mdesc_for_each_node_by_name(hp, mp, "cache") {
+		const u64 *cur_lvl;
+
+		cur_lvl = mdesc_get_property(hp, mp, "level", NULL);
+		if (*cur_lvl != level)
+			continue;
+
+		mark_sock_ids(hp, mp, idx);
+		idx++;
+		fnd = 1;
+	}
+	return fnd;
+}
+
+static void set_sock_ids_by_socket(struct mdesc_handle *hp, u64 mp)
+{
+	int idx = 1;
 
+	mdesc_for_each_node_by_name(hp, mp, "socket") {
+		u64 a;
+
+		mdesc_for_each_arc(a, hp, mp, MDESC_ARC_TYPE_FWD) {
+			u64 t = mdesc_arc_target(hp, a);
+			const char *name;
+			const u64 *id;
+
+			name = mdesc_node_name(hp, t);
+			if (strcmp(name, "cpu"))
+				continue;
+
+			id = mdesc_get_property(hp, t, "id", NULL);
+			if (*id < num_possible_cpus())
+				cpu_data(*id).sock_id = idx;
+		}
 		idx++;
 	}
 }
 
+static void set_sock_ids(struct mdesc_handle *hp)
+{
+	u64 mp;
+
+	/* If machine description exposes sockets data use it.
+	 * Otherwise fallback to use shared L3 or L2 caches.
+	 */
+	mp = mdesc_node_by_name(hp, MDESC_NODE_NULL, "sockets");
+	if (mp != MDESC_NODE_NULL)
+		return set_sock_ids_by_socket(hp, mp);
+
+	if (!set_sock_ids_by_cache(hp, 3))
+		set_sock_ids_by_cache(hp, 2);
+}
+
 static void mark_proc_ids(struct mdesc_handle *hp, u64 mp, int proc_id)
 {
 	u64 a;
@@ -707,7 +791,6 @@ static void __set_proc_ids(struct mdesc_handle *hp, const char *exec_unit_name)
 			continue;
 
 		mark_proc_ids(hp, mp, idx);
-
 		idx++;
 	}
 }
@@ -900,6 +983,7 @@ void mdesc_fill_in_cpu_data(cpumask_t *mask)
 
 	set_core_ids(hp);
 	set_proc_ids(hp);
+	set_sock_ids(hp);
 
 	mdesc_release(hp);
 
diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index 6f7251fd2eab..c928bc64b4ba 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c
@@ -1002,6 +1002,38 @@ static int __init pcibios_init(void)
 subsys_initcall(pcibios_init);
 
 #ifdef CONFIG_SYSFS
+
+#define SLOT_NAME_SIZE  11  /* Max decimal digits + null in u32 */
+
+static void pcie_bus_slot_names(struct pci_bus *pbus)
+{
+	struct pci_dev *pdev;
+	struct pci_bus *bus;
+
+	list_for_each_entry(pdev, &pbus->devices, bus_list) {
+		char name[SLOT_NAME_SIZE];
+		struct pci_slot *pci_slot;
+		const u32 *slot_num;
+		int len;
+
+		slot_num = of_get_property(pdev->dev.of_node,
+					   "physical-slot#", &len);
+
+		if (slot_num == NULL || len != 4)
+			continue;
+
+		snprintf(name, sizeof(name), "%u", slot_num[0]);
+		pci_slot = pci_create_slot(pbus, slot_num[0], name, NULL);
+
+		if (IS_ERR(pci_slot))
+			pr_err("PCI: pci_create_slot returned %ld.\n",
+			       PTR_ERR(pci_slot));
+	}
+
+	list_for_each_entry(bus, &pbus->children, node)
+		pcie_bus_slot_names(bus);
+}
+
 static void pci_bus_slot_names(struct device_node *node, struct pci_bus *bus)
 {
 	const struct pci_slot_names {
@@ -1053,18 +1085,29 @@ static int __init of_pci_slot_init(void)
 
 	while ((pbus = pci_find_next_bus(pbus)) != NULL) {
 		struct device_node *node;
+		struct pci_dev *pdev;
+
+		pdev = list_first_entry(&pbus->devices, struct pci_dev,
+					bus_list);
 
-		if (pbus->self) {
-			/* PCI->PCI bridge */
-			node = pbus->self->dev.of_node;
+		if (pdev && pci_is_pcie(pdev)) {
+			pcie_bus_slot_names(pbus);
 		} else {
-			struct pci_pbm_info *pbm = pbus->sysdata;
 
-			/* Host PCI controller */
-			node = pbm->op->dev.of_node;
-		}
+			if (pbus->self) {
+
+				/* PCI->PCI bridge */
+				node = pbus->self->dev.of_node;
+
+			} else {
+				struct pci_pbm_info *pbm = pbus->sysdata;
 
-		pci_bus_slot_names(node, pbus);
+				/* Host PCI controller */
+				node = pbm->op->dev.of_node;
+			}
+
+			pci_bus_slot_names(node, pbus);
+		}
 	}
 
 	return 0;
diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c
index c38d19fc27ba..f7b261749383 100644
--- a/arch/sparc/kernel/setup_64.c
+++ b/arch/sparc/kernel/setup_64.c
@@ -255,6 +255,24 @@ void sun4v_patch_2insn_range(struct sun4v_2insn_patch_entry *start,
 	}
 }
 
+void sun_m7_patch_2insn_range(struct sun4v_2insn_patch_entry *start,
+			     struct sun4v_2insn_patch_entry *end)
+{
+	while (start < end) {
+		unsigned long addr = start->addr;
+
+		*(unsigned int *) (addr +  0) = start->insns[0];
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr +  0));
+
+		*(unsigned int *) (addr +  4) = start->insns[1];
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr +  4));
+
+		start++;
+	}
+}
+
 static void __init sun4v_patch(void)
 {
 	extern void sun4v_hvapi_init(void);
@@ -267,6 +285,9 @@ static void __init sun4v_patch(void)
 
 	sun4v_patch_2insn_range(&__sun4v_2insn_patch,
 				&__sun4v_2insn_patch_end);
+	if (sun4v_chip_type == SUN4V_CHIP_SPARC_M7)
+		sun_m7_patch_2insn_range(&__sun_m7_2insn_patch,
+					 &__sun_m7_2insn_patch_end);
 
 	sun4v_hvapi_init();
 }
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index 61139d9924ca..19cd08d18672 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -60,8 +60,12 @@ DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
 cpumask_t cpu_core_map[NR_CPUS] __read_mostly =
 	{ [0 ... NR_CPUS-1] = CPU_MASK_NONE };
 
+cpumask_t cpu_core_sib_map[NR_CPUS] __read_mostly = {
+	[0 ... NR_CPUS-1] = CPU_MASK_NONE };
+
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 EXPORT_SYMBOL(cpu_core_map);
+EXPORT_SYMBOL(cpu_core_sib_map);
 
 static cpumask_t smp_commenced_mask;
 
@@ -1243,6 +1247,15 @@ void smp_fill_in_sib_core_maps(void)
 		}
 	}
 
+	for_each_present_cpu(i)  {
+		unsigned int j;
+
+		for_each_present_cpu(j)  {
+			if (cpu_data(i).sock_id == cpu_data(j).sock_id)
+				cpumask_set_cpu(j, &cpu_core_sib_map[i]);
+		}
+	}
+
 	for_each_present_cpu(i) {
 		unsigned int j;
 
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index 09243057cb0b..f1a2f688b28a 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -138,6 +138,11 @@ SECTIONS
 		*(.pause_3insn_patch)
 		__pause_3insn_patch_end = .;
 	}
+	.sun_m7_2insn_patch : {
+		__sun_m7_2insn_patch = .;
+		*(.sun_m7_2insn_patch)
+		__sun_m7_2insn_patch_end = .;
+	}
 	PERCPU_SECTION(SMP_CACHE_BYTES)
 
 	. = ALIGN(PAGE_SIZE);
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index 70d817154fe8..c399e7b3b035 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -21,6 +21,7 @@
 #include <linux/perf_event.h>
 #include <linux/interrupt.h>
 #include <linux/kdebug.h>
+#include <linux/uaccess.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -29,7 +30,6 @@
 #include <asm/setup.h>
 #include <asm/smp.h>
 #include <asm/traps.h>
-#include <asm/uaccess.h>
 
 #include "mm_32.h"
 
@@ -196,7 +196,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (pagefault_disabled() || !mm)
 		goto no_context;
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index 479823249429..e9268ea1a68d 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -22,12 +22,12 @@
 #include <linux/kdebug.h>
 #include <linux/percpu.h>
 #include <linux/context_tracking.h>
+#include <linux/uaccess.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/openprom.h>
 #include <asm/oplib.h>
-#include <asm/uaccess.h>
 #include <asm/asi.h>
 #include <asm/lsu.h>
 #include <asm/sections.h>
@@ -330,7 +330,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto intr_or_no_mm;
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c
index 449f864f0cef..a454ec5ff07a 100644
--- a/arch/sparc/mm/highmem.c
+++ b/arch/sparc/mm/highmem.c
@@ -53,7 +53,7 @@ void *kmap_atomic(struct page *page)
 	unsigned long vaddr;
 	long idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -91,6 +91,7 @@ void __kunmap_atomic(void *kvaddr)
 
 	if (vaddr < FIXADDR_START) { // FIXME
 		pagefault_enable();
+		preempt_enable();
 		return;
 	}
 
@@ -126,5 +127,6 @@ void __kunmap_atomic(void *kvaddr)
 
 	kmap_atomic_idx_pop();
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 4ca0d6ba5ec8..c5d08b89a96c 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -54,6 +54,7 @@
 #include "init_64.h"
 
 unsigned long kern_linear_pte_xor[4] __read_mostly;
+static unsigned long page_cache4v_flag;
 
 /* A bitmap, two bits for every 256MB of physical memory.  These two
  * bits determine what page size we use for kernel linear
@@ -1909,11 +1910,24 @@ static void __init sun4u_linear_pte_xor_finalize(void)
 
 static void __init sun4v_linear_pte_xor_finalize(void)
 {
+	unsigned long pagecv_flag;
+
+	/* Bit 9 of TTE is no longer CV bit on M7 processor and it instead
+	 * enables MCD error. Do not set bit 9 on M7 processor.
+	 */
+	switch (sun4v_chip_type) {
+	case SUN4V_CHIP_SPARC_M7:
+		pagecv_flag = 0x00;
+		break;
+	default:
+		pagecv_flag = _PAGE_CV_4V;
+		break;
+	}
 #ifndef CONFIG_DEBUG_PAGEALLOC
 	if (cpu_pgsz_mask & HV_PGSZ_MASK_256MB) {
 		kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^
 			PAGE_OFFSET;
-		kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+		kern_linear_pte_xor[1] |= (_PAGE_CP_4V | pagecv_flag |
 					   _PAGE_P_4V | _PAGE_W_4V);
 	} else {
 		kern_linear_pte_xor[1] = kern_linear_pte_xor[0];
@@ -1922,7 +1936,7 @@ static void __init sun4v_linear_pte_xor_finalize(void)
 	if (cpu_pgsz_mask & HV_PGSZ_MASK_2GB) {
 		kern_linear_pte_xor[2] = (_PAGE_VALID | _PAGE_SZ2GB_4V) ^
 			PAGE_OFFSET;
-		kern_linear_pte_xor[2] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+		kern_linear_pte_xor[2] |= (_PAGE_CP_4V | pagecv_flag |
 					   _PAGE_P_4V | _PAGE_W_4V);
 	} else {
 		kern_linear_pte_xor[2] = kern_linear_pte_xor[1];
@@ -1931,7 +1945,7 @@ static void __init sun4v_linear_pte_xor_finalize(void)
 	if (cpu_pgsz_mask & HV_PGSZ_MASK_16GB) {
 		kern_linear_pte_xor[3] = (_PAGE_VALID | _PAGE_SZ16GB_4V) ^
 			PAGE_OFFSET;
-		kern_linear_pte_xor[3] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+		kern_linear_pte_xor[3] |= (_PAGE_CP_4V | pagecv_flag |
 					   _PAGE_P_4V | _PAGE_W_4V);
 	} else {
 		kern_linear_pte_xor[3] = kern_linear_pte_xor[2];
@@ -1958,6 +1972,13 @@ static phys_addr_t __init available_memory(void)
 	return available;
 }
 
+#define _PAGE_CACHE_4U	(_PAGE_CP_4U | _PAGE_CV_4U)
+#define _PAGE_CACHE_4V	(_PAGE_CP_4V | _PAGE_CV_4V)
+#define __DIRTY_BITS_4U	 (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U)
+#define __DIRTY_BITS_4V	 (_PAGE_MODIFIED_4V | _PAGE_WRITE_4V | _PAGE_W_4V)
+#define __ACCESS_BITS_4U (_PAGE_ACCESSED_4U | _PAGE_READ_4U | _PAGE_R)
+#define __ACCESS_BITS_4V (_PAGE_ACCESSED_4V | _PAGE_READ_4V | _PAGE_R)
+
 /* We need to exclude reserved regions. This exclusion will include
  * vmlinux and initrd. To be more precise the initrd size could be used to
  * compute a new lower limit because it is freed later during initialization.
@@ -2034,6 +2055,25 @@ void __init paging_init(void)
 	memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb));
 #endif
 
+	/* TTE.cv bit on sparc v9 occupies the same position as TTE.mcde
+	 * bit on M7 processor. This is a conflicting usage of the same
+	 * bit. Enabling TTE.cv on M7 would turn on Memory Corruption
+	 * Detection error on all pages and this will lead to problems
+	 * later. Kernel does not run with MCD enabled and hence rest
+	 * of the required steps to fully configure memory corruption
+	 * detection are not taken. We need to ensure TTE.mcde is not
+	 * set on M7 processor. Compute the value of cacheability
+	 * flag for use later taking this into consideration.
+	 */
+	switch (sun4v_chip_type) {
+	case SUN4V_CHIP_SPARC_M7:
+		page_cache4v_flag = _PAGE_CP_4V;
+		break;
+	default:
+		page_cache4v_flag = _PAGE_CACHE_4V;
+		break;
+	}
+
 	if (tlb_type == hypervisor)
 		sun4v_pgprot_init();
 	else
@@ -2274,13 +2314,6 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 }
 #endif
 
-#define _PAGE_CACHE_4U	(_PAGE_CP_4U | _PAGE_CV_4U)
-#define _PAGE_CACHE_4V	(_PAGE_CP_4V | _PAGE_CV_4V)
-#define __DIRTY_BITS_4U	 (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U)
-#define __DIRTY_BITS_4V	 (_PAGE_MODIFIED_4V | _PAGE_WRITE_4V | _PAGE_W_4V)
-#define __ACCESS_BITS_4U (_PAGE_ACCESSED_4U | _PAGE_READ_4U | _PAGE_R)
-#define __ACCESS_BITS_4V (_PAGE_ACCESSED_4V | _PAGE_READ_4V | _PAGE_R)
-
 pgprot_t PAGE_KERNEL __read_mostly;
 EXPORT_SYMBOL(PAGE_KERNEL);
 
@@ -2312,8 +2345,7 @@ int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend,
 		    _PAGE_P_4U | _PAGE_W_4U);
 	if (tlb_type == hypervisor)
 		pte_base = (_PAGE_VALID | _PAGE_SZ4MB_4V |
-			    _PAGE_CP_4V | _PAGE_CV_4V |
-			    _PAGE_P_4V | _PAGE_W_4V);
+			    page_cache4v_flag | _PAGE_P_4V | _PAGE_W_4V);
 
 	pte_base |= _PAGE_PMD_HUGE;
 
@@ -2450,14 +2482,14 @@ static void __init sun4v_pgprot_init(void)
 	int i;
 
 	PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4V | _PAGE_VALID |
-				_PAGE_CACHE_4V | _PAGE_P_4V |
+				page_cache4v_flag | _PAGE_P_4V |
 				__ACCESS_BITS_4V | __DIRTY_BITS_4V |
 				_PAGE_EXEC_4V);
 	PAGE_KERNEL_LOCKED = PAGE_KERNEL;
 
 	_PAGE_IE = _PAGE_IE_4V;
 	_PAGE_E = _PAGE_E_4V;
-	_PAGE_CACHE = _PAGE_CACHE_4V;
+	_PAGE_CACHE = page_cache4v_flag;
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 	kern_linear_pte_xor[0] = _PAGE_VALID ^ PAGE_OFFSET;
@@ -2465,8 +2497,8 @@ static void __init sun4v_pgprot_init(void)
 	kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^
 		PAGE_OFFSET;
 #endif
-	kern_linear_pte_xor[0] |= (_PAGE_CP_4V | _PAGE_CV_4V |
-				   _PAGE_P_4V | _PAGE_W_4V);
+	kern_linear_pte_xor[0] |= (page_cache4v_flag | _PAGE_P_4V |
+				   _PAGE_W_4V);
 
 	for (i = 1; i < 4; i++)
 		kern_linear_pte_xor[i] = kern_linear_pte_xor[0];
@@ -2479,12 +2511,12 @@ static void __init sun4v_pgprot_init(void)
 			     _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V |
 			     _PAGE_SZ64K_4V | _PAGE_SZ8K_4V);
 
-	page_none = _PAGE_PRESENT_4V | _PAGE_ACCESSED_4V | _PAGE_CACHE_4V;
-	page_shared = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
+	page_none = _PAGE_PRESENT_4V | _PAGE_ACCESSED_4V | page_cache4v_flag;
+	page_shared = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag |
 		       __ACCESS_BITS_4V | _PAGE_WRITE_4V | _PAGE_EXEC_4V);
-	page_copy   = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
+	page_copy   = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag |
 		       __ACCESS_BITS_4V | _PAGE_EXEC_4V);
-	page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
+	page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4V | page_cache4v_flag |
 			 __ACCESS_BITS_4V | _PAGE_EXEC_4V);
 
 	page_exec_bit = _PAGE_EXEC_4V;
@@ -2542,7 +2574,7 @@ static unsigned long kern_large_tte(unsigned long paddr)
 	       _PAGE_EXEC_4U | _PAGE_L_4U | _PAGE_W_4U);
 	if (tlb_type == hypervisor)
 		val = (_PAGE_VALID | _PAGE_SZ4MB_4V |
-		       _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_P_4V |
+		       page_cache4v_flag | _PAGE_P_4V |
 		       _PAGE_EXEC_4V | _PAGE_W_4V);
 
 	return val | paddr;
@@ -2706,7 +2738,7 @@ void hugetlb_setup(struct pt_regs *regs)
 	struct mm_struct *mm = current->mm;
 	struct tsb_config *tp;
 
-	if (in_atomic() || !mm) {
+	if (faulthandler_disabled() || !mm) {
 		const struct exception_table_entry *entry;
 
 		entry = search_exception_tables(regs->tpc);
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index a07e31b50d3f..59cf0b911898 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -28,6 +28,7 @@ config TILE
 	select HAVE_DEBUG_STACKOVERFLOW
 	select ARCH_WANT_FRAME_POINTERS
 	select HAVE_CONTEXT_TRACKING
+	select EDAC_SUPPORT
 
 # FIXME: investigate whether we need/want these options.
 #	select HAVE_IOREMAP_PROT
diff --git a/arch/tile/include/asm/atomic_64.h b/arch/tile/include/asm/atomic_64.h
index 7b11c5fadd42..0496970cef82 100644
--- a/arch/tile/include/asm/atomic_64.h
+++ b/arch/tile/include/asm/atomic_64.h
@@ -105,9 +105,6 @@ static inline long atomic64_add_unless(atomic64_t *v, long a, long u)
 
 #define atomic64_inc_not_zero(v)	atomic64_add_unless((v), 1, 0)
 
-/* Define this to indicate that cmpxchg is an efficient operation. */
-#define __HAVE_ARCH_CMPXCHG
-
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_TILE_ATOMIC_64_H */
diff --git a/arch/tile/include/asm/edac.h b/arch/tile/include/asm/edac.h
deleted file mode 100644
index 87fc83eeaffd..000000000000
--- a/arch/tile/include/asm/edac.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright 2011 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- */
-
-#ifndef _ASM_TILE_EDAC_H
-#define _ASM_TILE_EDAC_H
-
-/* ECC atomic, DMA, SMP and interrupt safe scrub function */
-
-static inline void atomic_scrub(void *va, u32 size)
-{
-	/*
-	 * These is nothing to be done here because CE is
-	 * corrected by the mshim.
-	 */
-	return;
-}
-
-#endif /* _ASM_TILE_EDAC_H */
diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h
index 6ef4ecab1df2..dc61de15c1f9 100644
--- a/arch/tile/include/asm/io.h
+++ b/arch/tile/include/asm/io.h
@@ -54,7 +54,7 @@ extern void iounmap(volatile void __iomem *addr);
 
 #define ioremap_nocache(physaddr, size)		ioremap(physaddr, size)
 #define ioremap_wc(physaddr, size)		ioremap(physaddr, size)
-#define ioremap_writethrough(physaddr, size)	ioremap(physaddr, size)
+#define ioremap_wt(physaddr, size)		ioremap(physaddr, size)
 #define ioremap_fullcache(physaddr, size)	ioremap(physaddr, size)
 
 #define mmiowb()
diff --git a/arch/tile/include/asm/topology.h b/arch/tile/include/asm/topology.h
index 938311844233..76b0d0ebb244 100644
--- a/arch/tile/include/asm/topology.h
+++ b/arch/tile/include/asm/topology.h
@@ -55,7 +55,7 @@ static inline const struct cpumask *cpumask_of_node(int node)
 #define topology_physical_package_id(cpu)       ((void)(cpu), 0)
 #define topology_core_id(cpu)                   (cpu)
 #define topology_core_cpumask(cpu)              ((void)(cpu), cpu_online_mask)
-#define topology_thread_cpumask(cpu)            cpumask_of(cpu)
+#define topology_sibling_cpumask(cpu)           cpumask_of(cpu)
 #endif
 
 #endif /* _ASM_TILE_TOPOLOGY_H */
diff --git a/arch/tile/include/asm/uaccess.h b/arch/tile/include/asm/uaccess.h
index f41cb53cf645..a33276bf5ca1 100644
--- a/arch/tile/include/asm/uaccess.h
+++ b/arch/tile/include/asm/uaccess.h
@@ -78,7 +78,8 @@ int __range_ok(unsigned long addr, unsigned long size);
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Checks if a pointer to a block of memory in user space is valid.
  *
@@ -192,7 +193,8 @@ extern int __get_user_bad(void)
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -274,7 +276,8 @@ extern int __put_user_bad(void)
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -330,7 +333,8 @@ extern int __put_user_bad(void)
  * @from: Source address, in kernel space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -366,7 +370,8 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
  * @from: Source address, in user space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -437,7 +442,8 @@ static inline unsigned long __must_check copy_from_user(void *to,
  * @from: Source address, in user space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to user space.  Caller must check
  * the specified blocks with access_ok() before calling this function.
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index e83cc999da02..3f4f58d34a92 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -354,9 +354,9 @@ static int handle_page_fault(struct pt_regs *regs,
 
 	/*
 	 * If we're in an interrupt, have no user context or are running in an
-	 * atomic region then we must not take the fault.
+	 * region with pagefaults disabled then we must not take the fault.
 	 */
-	if (in_atomic() || !mm) {
+	if (pagefault_disabled() || !mm) {
 		vma = NULL;  /* happy compiler */
 		goto bad_area_nosemaphore;
 	}
diff --git a/arch/tile/mm/highmem.c b/arch/tile/mm/highmem.c
index 6aa2f2625447..fcd545014e79 100644
--- a/arch/tile/mm/highmem.c
+++ b/arch/tile/mm/highmem.c
@@ -201,7 +201,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 	int idx, type;
 	pte_t *pte;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 
 	/* Avoid icache flushes by disallowing atomic executable mappings. */
@@ -259,6 +259,7 @@ void __kunmap_atomic(void *kvaddr)
 	}
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 8e4daf44e980..47ff9b7f3e5d 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 #include <linux/hardirq.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 #include <asm/current.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -35,10 +36,10 @@ int handle_page_fault(unsigned long address, unsigned long ip,
 	*code_out = SEGV_MAPERR;
 
 	/*
-	 * If the fault was during atomic operation, don't take the fault, just
+	 * If the fault was with pagefaults disabled, don't take the fault, just
 	 * fail.
 	 */
-	if (in_atomic())
+	if (faulthandler_disabled())
 		goto out_nosemaphore;
 
 	if (is_user)
diff --git a/arch/unicore32/include/asm/pci.h b/arch/unicore32/include/asm/pci.h
index 654407e98619..38b3f3785c3c 100644
--- a/arch/unicore32/include/asm/pci.h
+++ b/arch/unicore32/include/asm/pci.h
@@ -18,16 +18,6 @@
 #include <asm-generic/pci.h>
 #include <mach/hardware.h> /* for PCIBIOS_MIN_* */
 
-#ifdef CONFIG_PCI
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
-#endif
-
 #define HAVE_PCI_MMAP
 extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 	enum pci_mmap_state mmap_state, int write_combine);
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index 0dc922dba915..afccef5529cc 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -218,7 +218,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	 * If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm)
+	if (faulthandler_disabled() || !mm)
 		goto no_context;
 
 	if (user_mode(regs))
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 3942f74c92d7..1538562cc720 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -1,3 +1,6 @@
+
+obj-y += entry/
+
 obj-$(CONFIG_KVM) += kvm/
 
 # Xen paravirtualization support
@@ -11,7 +14,7 @@ obj-y += kernel/
 obj-y += mm/
 
 obj-y += crypto/
-obj-y += vdso/
+
 obj-$(CONFIG_IA32_EMULATION) += ia32/
 
 obj-y += platform/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 226d5696e1d1..8e0b76ad8350 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -9,140 +9,143 @@ config 64BIT
 config X86_32
 	def_bool y
 	depends on !64BIT
-	select CLKSRC_I8253
-	select HAVE_UID16
 
 config X86_64
 	def_bool y
 	depends on 64BIT
-	select X86_DEV_DMA_OPS
-	select ARCH_USE_CMPXCHG_LOCKREF
-	select HAVE_LIVEPATCH
 
 ### Arch settings
 config X86
 	def_bool y
-	select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
-	select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
+	select ACPI_LEGACY_TABLES_LOOKUP	if ACPI
+	select ACPI_SYSTEM_POWER_STATES_SUPPORT	if ACPI
+	select ANON_INODES
+	select ARCH_CLOCKSOURCE_DATA
+	select ARCH_DISCARD_MEMBLOCK
+	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
+	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_GCOV_PROFILE_ALL
+	select ARCH_HAS_SG_CHAIN
+	select ARCH_HAVE_NMI_SAFE_CMPXCHG
+	select ARCH_MIGHT_HAVE_ACPI_PDC		if ACPI
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
-	select HAVE_AOUT if X86_32
-	select HAVE_UNSTABLE_SCHED_CLOCK
-	select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
-	select ARCH_SUPPORTS_INT128 if X86_64
-	select HAVE_IDE
-	select HAVE_OPROFILE
-	select HAVE_PCSPKR_PLATFORM
-	select HAVE_PERF_EVENTS
-	select HAVE_IOREMAP_PROT
-	select HAVE_KPROBES
-	select HAVE_MEMBLOCK
-	select HAVE_MEMBLOCK_NODE_MAP
-	select ARCH_DISCARD_MEMBLOCK
-	select ARCH_WANT_OPTIONAL_GPIOLIB
+	select ARCH_SUPPORTS_ATOMIC_RMW
+	select ARCH_SUPPORTS_INT128		if X86_64
+	select ARCH_SUPPORTS_NUMA_BALANCING	if X86_64
+	select ARCH_USE_BUILTIN_BSWAP
+	select ARCH_USE_CMPXCHG_LOCKREF		if X86_64
+	select ARCH_USE_QUEUED_RWLOCKS
+	select ARCH_USE_QUEUED_SPINLOCKS
 	select ARCH_WANT_FRAME_POINTERS
-	select HAVE_DMA_ATTRS
-	select HAVE_DMA_CONTIGUOUS
-	select HAVE_KRETPROBES
+	select ARCH_WANT_IPC_PARSE_VERSION	if X86_32
+	select ARCH_WANT_OPTIONAL_GPIOLIB
+	select BUILDTIME_EXTABLE_SORT
+	select CLKEVT_I8253
+	select CLKSRC_I8253			if X86_32
+	select CLOCKSOURCE_VALIDATE_LAST_CYCLE
+	select CLOCKSOURCE_WATCHDOG
+	select CLONE_BACKWARDS			if X86_32
+	select COMPAT_OLD_SIGACTION		if IA32_EMULATION
+	select DCACHE_WORD_ACCESS
+	select EDAC_ATOMIC_SCRUB
+	select EDAC_SUPPORT
+	select GENERIC_CLOCKEVENTS
+	select GENERIC_CLOCKEVENTS_BROADCAST	if X86_64 || (X86_32 && X86_LOCAL_APIC)
+	select GENERIC_CLOCKEVENTS_MIN_ADJUST
+	select GENERIC_CMOS_UPDATE
+	select GENERIC_CPU_AUTOPROBE
 	select GENERIC_EARLY_IOREMAP
-	select HAVE_OPTPROBES
-	select HAVE_KPROBES_ON_FTRACE
-	select HAVE_FTRACE_MCOUNT_RECORD
-	select HAVE_FENTRY if X86_64
+	select GENERIC_FIND_FIRST_BIT
+	select GENERIC_IOMAP
+	select GENERIC_IRQ_PROBE
+	select GENERIC_IRQ_SHOW
+	select GENERIC_PENDING_IRQ		if SMP
+	select GENERIC_SMP_IDLE_THREAD
+	select GENERIC_STRNCPY_FROM_USER
+	select GENERIC_STRNLEN_USER
+	select GENERIC_TIME_VSYSCALL
+	select HAVE_ACPI_APEI			if ACPI
+	select HAVE_ACPI_APEI_NMI		if ACPI
+	select HAVE_ALIGNED_STRUCT_PAGE		if SLUB
+	select HAVE_AOUT			if X86_32
+	select HAVE_ARCH_AUDITSYSCALL
+	select HAVE_ARCH_HUGE_VMAP		if X86_64 || X86_PAE
+	select HAVE_ARCH_JUMP_LABEL
+	select HAVE_ARCH_KASAN			if X86_64 && SPARSEMEM_VMEMMAP
+	select HAVE_ARCH_KGDB
+	select HAVE_ARCH_KMEMCHECK
+	select HAVE_ARCH_SECCOMP_FILTER
+	select HAVE_ARCH_SOFT_DIRTY		if X86_64
+	select HAVE_ARCH_TRACEHOOK
+	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+	select HAVE_BPF_JIT			if X86_64
+	select HAVE_CC_STACKPROTECTOR
+	select HAVE_CMPXCHG_DOUBLE
+	select HAVE_CMPXCHG_LOCAL
+	select HAVE_CONTEXT_TRACKING		if X86_64
 	select HAVE_C_RECORDMCOUNT
+	select HAVE_DEBUG_KMEMLEAK
+	select HAVE_DEBUG_STACKOVERFLOW
+	select HAVE_DMA_API_DEBUG
+	select HAVE_DMA_ATTRS
+	select HAVE_DMA_CONTIGUOUS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
-	select HAVE_FUNCTION_TRACER
-	select HAVE_FUNCTION_GRAPH_TRACER
-	select HAVE_FUNCTION_GRAPH_FP_TEST
-	select HAVE_SYSCALL_TRACEPOINTS
-	select SYSCTL_EXCEPTION_TRACE
-	select HAVE_KVM
-	select HAVE_ARCH_KGDB
-	select HAVE_ARCH_TRACEHOOK
-	select HAVE_GENERIC_DMA_COHERENT if X86_32
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
-	select USER_STACKTRACE_SUPPORT
-	select HAVE_REGS_AND_STACK_ACCESS_API
-	select HAVE_DMA_API_DEBUG
-	select HAVE_KERNEL_GZIP
+	select HAVE_FENTRY			if X86_64
+	select HAVE_FTRACE_MCOUNT_RECORD
+	select HAVE_FUNCTION_GRAPH_FP_TEST
+	select HAVE_FUNCTION_GRAPH_TRACER
+	select HAVE_FUNCTION_TRACER
+	select HAVE_GENERIC_DMA_COHERENT	if X86_32
+	select HAVE_HW_BREAKPOINT
+	select HAVE_IDE
+	select HAVE_IOREMAP_PROT
+	select HAVE_IRQ_EXIT_ON_IRQ_STACK	if X86_64
+	select HAVE_IRQ_TIME_ACCOUNTING
 	select HAVE_KERNEL_BZIP2
+	select HAVE_KERNEL_GZIP
+	select HAVE_KERNEL_LZ4
 	select HAVE_KERNEL_LZMA
-	select HAVE_KERNEL_XZ
 	select HAVE_KERNEL_LZO
-	select HAVE_KERNEL_LZ4
-	select HAVE_HW_BREAKPOINT
+	select HAVE_KERNEL_XZ
+	select HAVE_KPROBES
+	select HAVE_KPROBES_ON_FTRACE
+	select HAVE_KRETPROBES
+	select HAVE_KVM
+	select HAVE_LIVEPATCH			if X86_64
+	select HAVE_MEMBLOCK
+	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_MIXED_BREAKPOINTS_REGS
-	select PERF_EVENTS
+	select HAVE_OPROFILE
+	select HAVE_OPTPROBES
+	select HAVE_PCSPKR_PLATFORM
+	select HAVE_PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
-	select HAVE_DEBUG_KMEMLEAK
-	select ANON_INODES
-	select HAVE_ALIGNED_STRUCT_PAGE if SLUB
-	select HAVE_CMPXCHG_LOCAL
-	select HAVE_CMPXCHG_DOUBLE
-	select HAVE_ARCH_KMEMCHECK
-	select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
+	select HAVE_REGS_AND_STACK_ACCESS_API
+	select HAVE_SYSCALL_TRACEPOINTS
+	select HAVE_UID16			if X86_32
+	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_USER_RETURN_NOTIFIER
-	select ARCH_HAS_ELF_RANDOMIZE
-	select HAVE_ARCH_JUMP_LABEL
-	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
-	select SPARSE_IRQ
-	select GENERIC_FIND_FIRST_BIT
-	select GENERIC_IRQ_PROBE
-	select GENERIC_PENDING_IRQ if SMP
-	select GENERIC_IRQ_SHOW
-	select GENERIC_CLOCKEVENTS_MIN_ADJUST
 	select IRQ_FORCED_THREADING
-	select HAVE_BPF_JIT if X86_64
-	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
-	select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE)
-	select ARCH_HAS_SG_CHAIN
-	select CLKEVT_I8253
-	select ARCH_HAVE_NMI_SAFE_CMPXCHG
-	select GENERIC_IOMAP
-	select DCACHE_WORD_ACCESS
-	select GENERIC_SMP_IDLE_THREAD
-	select ARCH_WANT_IPC_PARSE_VERSION if X86_32
-	select HAVE_ARCH_SECCOMP_FILTER
-	select BUILDTIME_EXTABLE_SORT
-	select GENERIC_CMOS_UPDATE
-	select HAVE_ARCH_SOFT_DIRTY if X86_64
-	select CLOCKSOURCE_WATCHDOG
-	select GENERIC_CLOCKEVENTS
-	select ARCH_CLOCKSOURCE_DATA
-	select CLOCKSOURCE_VALIDATE_LAST_CYCLE
-	select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
-	select GENERIC_TIME_VSYSCALL
-	select GENERIC_STRNCPY_FROM_USER
-	select GENERIC_STRNLEN_USER
-	select HAVE_CONTEXT_TRACKING if X86_64
-	select HAVE_IRQ_TIME_ACCOUNTING
-	select VIRT_TO_BUS
-	select MODULES_USE_ELF_REL if X86_32
-	select MODULES_USE_ELF_RELA if X86_64
-	select CLONE_BACKWARDS if X86_32
-	select ARCH_USE_BUILTIN_BSWAP
-	select ARCH_USE_QUEUE_RWLOCK
-	select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
-	select OLD_SIGACTION if X86_32
-	select COMPAT_OLD_SIGACTION if IA32_EMULATION
+	select MODULES_USE_ELF_RELA		if X86_64
+	select MODULES_USE_ELF_REL		if X86_32
+	select OLD_SIGACTION			if X86_32
+	select OLD_SIGSUSPEND3			if X86_32 || IA32_EMULATION
+	select PERF_EVENTS
 	select RTC_LIB
-	select HAVE_DEBUG_STACKOVERFLOW
-	select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
-	select HAVE_CC_STACKPROTECTOR
-	select GENERIC_CPU_AUTOPROBE
-	select HAVE_ARCH_AUDITSYSCALL
-	select ARCH_SUPPORTS_ATOMIC_RMW
-	select HAVE_ACPI_APEI if ACPI
-	select HAVE_ACPI_APEI_NMI if ACPI
-	select ACPI_LEGACY_TABLES_LOOKUP if ACPI
-	select X86_FEATURE_NAMES if PROC_FS
+	select SPARSE_IRQ
 	select SRCU
+	select SYSCTL_EXCEPTION_TRACE
+	select USER_STACKTRACE_SUPPORT
+	select VIRT_TO_BUS
+	select X86_DEV_DMA_OPS			if X86_64
+	select X86_FEATURE_NAMES		if PROC_FS
 
 config INSTRUCTION_DECODER
 	def_bool y
@@ -260,10 +263,6 @@ config X86_64_SMP
 	def_bool y
 	depends on X86_64 && SMP
 
-config X86_HT
-	def_bool y
-	depends on SMP
-
 config X86_32_LAZY_GS
 	def_bool y
 	depends on X86_32 && !CC_STACKPROTECTOR
@@ -341,7 +340,7 @@ config X86_FEATURE_NAMES
 
 config X86_X2APIC
 	bool "Support x2apic"
-	depends on X86_LOCAL_APIC && X86_64 && IRQ_REMAP
+	depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST)
 	---help---
 	  This enables x2apic support on CPUs that have this feature.
 
@@ -441,6 +440,7 @@ config X86_UV
 	depends on X86_EXTENDED_PLATFORM
 	depends on NUMA
 	depends on X86_X2APIC
+	depends on PCI
 	---help---
 	  This option is needed in order to support SGI Ultraviolet systems.
 	  If you don't have one of these, you should say N here.
@@ -466,7 +466,6 @@ config X86_INTEL_CE
 	select X86_REBOOTFIXUPS
 	select OF
 	select OF_EARLY_FLATTREE
-	select IRQ_DOMAIN
 	---help---
 	  Select for the Intel CE media processor (CE4100) SOC.
 	  This option compiles in support for the CE4100 SOC for settop
@@ -666,7 +665,7 @@ config PARAVIRT_DEBUG
 config PARAVIRT_SPINLOCKS
 	bool "Paravirtualization layer for spinlocks"
 	depends on PARAVIRT && SMP
-	select UNINLINE_SPIN_UNLOCK
+	select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCKS
 	---help---
 	  Paravirtualized spinlocks allow a pvops backend to replace the
 	  spinlock implementation with something virtualization-friendly
@@ -851,11 +850,12 @@ config NR_CPUS
 	default "1" if !SMP
 	default "8192" if MAXSMP
 	default "32" if SMP && X86_BIGSMP
-	default "8" if SMP
+	default "8" if SMP && X86_32
+	default "64" if SMP
 	---help---
 	  This allows you to specify the maximum number of CPUs which this
 	  kernel will support.  If CPUMASK_OFFSTACK is enabled, the maximum
-	  supported value is 4096, otherwise the maximum value is 512.  The
+	  supported value is 8192, otherwise the maximum value is 512.  The
 	  minimum value which makes sense is 2.
 
 	  This is purely to save memory - each supported CPU adds
@@ -863,7 +863,7 @@ config NR_CPUS
 
 config SCHED_SMT
 	bool "SMT (Hyperthreading) scheduler support"
-	depends on X86_HT
+	depends on SMP
 	---help---
 	  SMT scheduler support improves the CPU scheduler's decision making
 	  when dealing with Intel Pentium 4 chips with HyperThreading at a
@@ -873,7 +873,7 @@ config SCHED_SMT
 config SCHED_MC
 	def_bool y
 	prompt "Multi-core scheduler support"
-	depends on X86_HT
+	depends on SMP
 	---help---
 	  Multi-core scheduler support improves the CPU scheduler's decision
 	  making when dealing with multi-core CPU chips at a cost of slightly
@@ -914,12 +914,12 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
 	def_bool y
 	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI
-	select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+	select IRQ_DOMAIN_HIERARCHY
+	select PCI_MSI_IRQ_DOMAIN if PCI_MSI
 
 config X86_IO_APIC
 	def_bool y
 	depends on X86_LOCAL_APIC || X86_UP_IOAPIC
-	select IRQ_DOMAIN
 
 config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
 	bool "Reroute for broken boot IRQs"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 72484a645f05..a15893d17c55 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -332,4 +332,27 @@ config X86_DEBUG_STATIC_CPU_HAS
 
 	  If unsure, say N.
 
+config X86_DEBUG_FPU
+	bool "Debug the x86 FPU code"
+	depends on DEBUG_KERNEL
+	default y
+	---help---
+	  If this option is enabled then there will be extra sanity
+	  checks and (boot time) debug printouts added to the kernel.
+	  This debugging adds some small amount of runtime overhead
+	  to the kernel.
+
+	  If unsure, say N.
+
+config PUNIT_ATOM_DEBUG
+	tristate "ATOM Punit debug driver"
+	select DEBUG_FS
+	select IOSF_MBI
+	---help---
+	  This is a debug driver, which gets the power states
+	  of all Punit North Complex devices. The power states of
+	  each device is exposed as part of the debugfs interface.
+	  The current power state can be read from
+	  /sys/kernel/debug/punit_atom/dev_power_state
+
 endmenu
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2fda005bb334..118e6debc483 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -77,6 +77,12 @@ else
         KBUILD_AFLAGS += -m64
         KBUILD_CFLAGS += -m64
 
+        # Align jump targets to 1 byte, not the default 16 bytes:
+        KBUILD_CFLAGS += -falign-jumps=1
+
+        # Pack loops tightly as well:
+        KBUILD_CFLAGS += -falign-loops=1
+
         # Don't autogenerate traditional x87 instructions
         KBUILD_CFLAGS += $(call cc-option,-mno-80387)
         KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387)
@@ -84,6 +90,9 @@ else
 	# Use -mpreferred-stack-boundary=3 if supported.
 	KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3)
 
+	# Use -mskip-rax-setup if supported.
+	KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
+
         # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
         cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
         cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
@@ -140,12 +149,6 @@ endif
 sp-$(CONFIG_X86_32) := esp
 sp-$(CONFIG_X86_64) := rsp
 
-# do binutils support CFI?
-cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
-# is .cfi_signal_frame supported too?
-cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
-cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
-
 # does binutils support specific instructions?
 asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
 asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
@@ -153,8 +156,8 @@ asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
 avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
 avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
 
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)
+KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr)
+KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr)
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 
@@ -178,7 +181,7 @@ archscripts: scripts_basic
 # Syscall table generation
 
 archheaders:
-	$(Q)$(MAKE) $(build)=arch/x86/syscalls all
+	$(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all
 
 archprepare:
 ifeq ($(CONFIG_KEXEC_FILE),y)
@@ -241,7 +244,7 @@ install:
 
 PHONY += vdso_install
 vdso_install:
-	$(Q)$(MAKE) $(build)=arch/x86/vdso $@
+	$(Q)$(MAKE) $(build)=arch/x86/entry/vdso $@
 
 archclean:
 	$(Q)rm -rf $(objtree)/arch/i386
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 89dd0d78013a..805d25ca5f1d 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -2,15 +2,14 @@
 #define BOOT_COMPRESSED_MISC_H
 
 /*
- * we have to be careful, because no indirections are allowed here, and
- * paravirt_ops is a kind of one. As it will only run in baremetal anyway,
- * we just keep it from happening
+ * Special hack: we have to be careful, because no indirections are allowed here,
+ * and paravirt_ops is a kind of one. As it will only run in baremetal anyway,
+ * we just keep it from happening. (This list needs to be extended when new
+ * paravirt and debugging variants are added.)
  */
 #undef CONFIG_PARAVIRT
+#undef CONFIG_PARAVIRT_SPINLOCKS
 #undef CONFIG_KASAN
-#ifdef CONFIG_X86_32
-#define _ASM_X86_DESC_H 1
-#endif
 
 #include <linux/linkage.h>
 #include <linux/screen_info.h>
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 112cefacf2af..2bfc8a7c88c1 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -32,7 +32,7 @@
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
 #include <asm/cpu_device_id.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/aes.h>
 #include <crypto/ablk_helper.h>
 #include <crypto/scatterwalk.h>
@@ -44,15 +44,19 @@
 #endif
 
 
+#define AESNI_ALIGN	16
+#define AES_BLOCK_MASK	(~(AES_BLOCK_SIZE - 1))
+#define RFC4106_HASH_SUBKEY_SIZE 16
+
 /* This data is stored at the end of the crypto_tfm struct.
  * It's a type of per "session" data storage location.
  * This needs to be 16 byte aligned.
  */
 struct aesni_rfc4106_gcm_ctx {
-	u8 hash_subkey[16];
-	struct crypto_aes_ctx aes_key_expanded;
+	u8 hash_subkey[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+	struct crypto_aes_ctx aes_key_expanded
+		__attribute__ ((__aligned__(AESNI_ALIGN)));
 	u8 nonce[4];
-	struct cryptd_aead *cryptd_tfm;
 };
 
 struct aesni_gcm_set_hash_subkey_result {
@@ -66,10 +70,6 @@ struct aesni_hash_subkey_req_data {
 	struct scatterlist sg;
 };
 
-#define AESNI_ALIGN	(16)
-#define AES_BLOCK_MASK	(~(AES_BLOCK_SIZE-1))
-#define RFC4106_HASH_SUBKEY_SIZE 16
-
 struct aesni_lrw_ctx {
 	struct lrw_table_ctx lrw_table;
 	u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1];
@@ -283,10 +283,11 @@ static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out,
 static inline struct
 aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
 {
-	return
-		(struct aesni_rfc4106_gcm_ctx *)
-		PTR_ALIGN((u8 *)
-		crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
+	unsigned long align = AESNI_ALIGN;
+
+	if (align <= crypto_tfm_ctx_alignment())
+		align = 1;
+	return PTR_ALIGN(crypto_aead_ctx(tfm), align);
 }
 #endif
 
@@ -790,36 +791,30 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
 #endif
 
 #ifdef CONFIG_X86_64
-static int rfc4106_init(struct crypto_tfm *tfm)
+static int rfc4106_init(struct crypto_aead *aead)
 {
 	struct cryptd_aead *cryptd_tfm;
-	struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
-		PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
-	struct crypto_aead *cryptd_child;
-	struct aesni_rfc4106_gcm_ctx *child_ctx;
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
 	cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni",
 				       CRYPTO_ALG_INTERNAL,
 				       CRYPTO_ALG_INTERNAL);
 	if (IS_ERR(cryptd_tfm))
 		return PTR_ERR(cryptd_tfm);
 
-	cryptd_child = cryptd_aead_child(cryptd_tfm);
-	child_ctx = aesni_rfc4106_gcm_ctx_get(cryptd_child);
-	memcpy(child_ctx, ctx, sizeof(*ctx));
-	ctx->cryptd_tfm = cryptd_tfm;
-	tfm->crt_aead.reqsize = sizeof(struct aead_request)
-		+ crypto_aead_reqsize(&cryptd_tfm->base);
+	*ctx = cryptd_tfm;
+	crypto_aead_set_reqsize(
+		aead,
+		sizeof(struct aead_request) +
+		crypto_aead_reqsize(&cryptd_tfm->base));
 	return 0;
 }
 
-static void rfc4106_exit(struct crypto_tfm *tfm)
+static void rfc4106_exit(struct crypto_aead *aead)
 {
-	struct aesni_rfc4106_gcm_ctx *ctx =
-		(struct aesni_rfc4106_gcm_ctx *)
-		PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
-	if (!IS_ERR(ctx->cryptd_tfm))
-		cryptd_free_aead(ctx->cryptd_tfm);
-	return;
+	struct cryptd_aead **ctx = crypto_aead_ctx(aead);
+
+	cryptd_free_aead(*ctx);
 }
 
 static void
@@ -845,8 +840,6 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
 	if (IS_ERR(ctr_tfm))
 		return PTR_ERR(ctr_tfm);
 
-	crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
-
 	ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
 	if (ret)
 		goto out_free_ablkcipher;
@@ -895,73 +888,29 @@ out_free_ablkcipher:
 static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
 				  unsigned int key_len)
 {
-	int ret = 0;
-	struct crypto_tfm *tfm = crypto_aead_tfm(aead);
 	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead);
-	u8 *new_key_align, *new_key_mem = NULL;
 
 	if (key_len < 4) {
-		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
 		return -EINVAL;
 	}
 	/*Account for 4 byte nonce at the end.*/
 	key_len -= 4;
-	if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
-	    key_len != AES_KEYSIZE_256) {
-		crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
-		return -EINVAL;
-	}
 
 	memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
-	/*This must be on a 16 byte boundary!*/
-	if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN)
-		return -EINVAL;
-
-	if ((unsigned long)key % AESNI_ALIGN) {
-		/*key is not aligned: use an auxuliar aligned pointer*/
-		new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
-		if (!new_key_mem)
-			return -ENOMEM;
-
-		new_key_align = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
-		memcpy(new_key_align, key, key_len);
-		key = new_key_align;
-	}
 
-	if (!irq_fpu_usable())
-		ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
-		key, key_len);
-	else {
-		kernel_fpu_begin();
-		ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
-		kernel_fpu_end();
-	}
-	/*This must be on a 16 byte boundary!*/
-	if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
-		ret = -EINVAL;
-		goto exit;
-	}
-	ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
-exit:
-	kfree(new_key_mem);
-	return ret;
+	return aes_set_key_common(crypto_aead_tfm(aead),
+				  &ctx->aes_key_expanded, key, key_len) ?:
+	       rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
 }
 
 static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
 			   unsigned int key_len)
 {
-	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
-	struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm);
-	struct aesni_rfc4106_gcm_ctx *c_ctx = aesni_rfc4106_gcm_ctx_get(child);
-	struct cryptd_aead *cryptd_tfm = ctx->cryptd_tfm;
-	int ret;
+	struct cryptd_aead **ctx = crypto_aead_ctx(parent);
+	struct cryptd_aead *cryptd_tfm = *ctx;
 
-	ret = crypto_aead_setkey(child, key, key_len);
-	if (!ret) {
-		memcpy(ctx, c_ctx, sizeof(*ctx));
-		ctx->cryptd_tfm = cryptd_tfm;
-	}
-	return ret;
+	return crypto_aead_setkey(&cryptd_tfm->base, key, key_len);
 }
 
 static int common_rfc4106_set_authsize(struct crypto_aead *aead,
@@ -975,7 +924,7 @@ static int common_rfc4106_set_authsize(struct crypto_aead *aead,
 	default:
 		return -EINVAL;
 	}
-	crypto_aead_crt(aead)->authsize = authsize;
+
 	return 0;
 }
 
@@ -984,30 +933,23 @@ static int common_rfc4106_set_authsize(struct crypto_aead *aead,
 static int rfc4106_set_authsize(struct crypto_aead *parent,
 				unsigned int authsize)
 {
-	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
-	struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm);
-	int ret;
+	struct cryptd_aead **ctx = crypto_aead_ctx(parent);
+	struct cryptd_aead *cryptd_tfm = *ctx;
 
-	ret = crypto_aead_setauthsize(child, authsize);
-	if (!ret)
-		crypto_aead_crt(parent)->authsize = authsize;
-	return ret;
+	return crypto_aead_setauthsize(&cryptd_tfm->base, authsize);
 }
 
-static int __driver_rfc4106_encrypt(struct aead_request *req)
+static int helper_rfc4106_encrypt(struct aead_request *req)
 {
 	u8 one_entry_in_sg = 0;
 	u8 *src, *dst, *assoc;
 	__be32 counter = cpu_to_be32(1);
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
-	u32 key_len = ctx->aes_key_expanded.key_length;
 	void *aes_ctx = &(ctx->aes_key_expanded);
 	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
-	u8 iv_tab[16+AESNI_ALIGN];
-	u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
+	u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
 	struct scatter_walk src_sg_walk;
-	struct scatter_walk assoc_sg_walk;
 	struct scatter_walk dst_sg_walk;
 	unsigned int i;
 
@@ -1016,12 +958,6 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
 	/* to 8 or 12 bytes */
 	if (unlikely(req->assoclen != 8 && req->assoclen != 12))
 		return -EINVAL;
-	if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16))
-	        return -EINVAL;
-	if (unlikely(key_len != AES_KEYSIZE_128 &&
-	             key_len != AES_KEYSIZE_192 &&
-	             key_len != AES_KEYSIZE_256))
-	        return -EINVAL;
 
 	/* IV below built */
 	for (i = 0; i < 4; i++)
@@ -1030,55 +966,57 @@ static int __driver_rfc4106_encrypt(struct aead_request *req)
 		*(iv+4+i) = req->iv[i];
 	*((__be32 *)(iv+12)) = counter;
 
-	if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+	if (sg_is_last(req->src) &&
+	    req->src->offset + req->src->length <= PAGE_SIZE &&
+	    sg_is_last(req->dst) &&
+	    req->dst->offset + req->dst->length <= PAGE_SIZE) {
 		one_entry_in_sg = 1;
 		scatterwalk_start(&src_sg_walk, req->src);
-		scatterwalk_start(&assoc_sg_walk, req->assoc);
-		src = scatterwalk_map(&src_sg_walk);
-		assoc = scatterwalk_map(&assoc_sg_walk);
+		assoc = scatterwalk_map(&src_sg_walk);
+		src = assoc + req->assoclen;
 		dst = src;
 		if (unlikely(req->src != req->dst)) {
 			scatterwalk_start(&dst_sg_walk, req->dst);
-			dst = scatterwalk_map(&dst_sg_walk);
+			dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
 		}
-
 	} else {
 		/* Allocate memory for src, dst, assoc */
-		src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
+		assoc = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
 			GFP_ATOMIC);
-		if (unlikely(!src))
+		if (unlikely(!assoc))
 			return -ENOMEM;
-		assoc = (src + req->cryptlen + auth_tag_len);
-		scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
-		scatterwalk_map_and_copy(assoc, req->assoc, 0,
-					req->assoclen, 0);
+		scatterwalk_map_and_copy(assoc, req->src, 0,
+					 req->assoclen + req->cryptlen, 0);
+		src = assoc + req->assoclen;
 		dst = src;
 	}
 
+	kernel_fpu_begin();
 	aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
 		ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
 		+ ((unsigned long)req->cryptlen), auth_tag_len);
+	kernel_fpu_end();
 
 	/* The authTag (aka the Integrity Check Value) needs to be written
 	 * back to the packet. */
 	if (one_entry_in_sg) {
 		if (unlikely(req->src != req->dst)) {
-			scatterwalk_unmap(dst);
-			scatterwalk_done(&dst_sg_walk, 0, 0);
+			scatterwalk_unmap(dst - req->assoclen);
+			scatterwalk_advance(&dst_sg_walk, req->dst->length);
+			scatterwalk_done(&dst_sg_walk, 1, 0);
 		}
-		scatterwalk_unmap(src);
 		scatterwalk_unmap(assoc);
-		scatterwalk_done(&src_sg_walk, 0, 0);
-		scatterwalk_done(&assoc_sg_walk, 0, 0);
+		scatterwalk_advance(&src_sg_walk, req->src->length);
+		scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
 	} else {
-		scatterwalk_map_and_copy(dst, req->dst, 0,
-			req->cryptlen + auth_tag_len, 1);
-		kfree(src);
+		scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
+					 req->cryptlen + auth_tag_len, 1);
+		kfree(assoc);
 	}
 	return 0;
 }
 
-static int __driver_rfc4106_decrypt(struct aead_request *req)
+static int helper_rfc4106_decrypt(struct aead_request *req)
 {
 	u8 one_entry_in_sg = 0;
 	u8 *src, *dst, *assoc;
@@ -1087,26 +1025,16 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 	int retval = 0;
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
 	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
-	u32 key_len = ctx->aes_key_expanded.key_length;
 	void *aes_ctx = &(ctx->aes_key_expanded);
 	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
-	u8 iv_and_authTag[32+AESNI_ALIGN];
-	u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
-	u8 *authTag = iv + 16;
+	u8 iv[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
+	u8 authTag[16];
 	struct scatter_walk src_sg_walk;
-	struct scatter_walk assoc_sg_walk;
 	struct scatter_walk dst_sg_walk;
 	unsigned int i;
 
-	if (unlikely((req->cryptlen < auth_tag_len) ||
-		(req->assoclen != 8 && req->assoclen != 12)))
+	if (unlikely(req->assoclen != 8 && req->assoclen != 12))
 		return -EINVAL;
-	if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16))
-	        return -EINVAL;
-	if (unlikely(key_len != AES_KEYSIZE_128 &&
-	             key_len != AES_KEYSIZE_192 &&
-	             key_len != AES_KEYSIZE_256))
-	        return -EINVAL;
 
 	/* Assuming we are supporting rfc4106 64-bit extended */
 	/* sequence numbers We need to have the AAD length */
@@ -1120,33 +1048,36 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 		*(iv+4+i) = req->iv[i];
 	*((__be32 *)(iv+12)) = counter;
 
-	if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+	if (sg_is_last(req->src) &&
+	    req->src->offset + req->src->length <= PAGE_SIZE &&
+	    sg_is_last(req->dst) &&
+	    req->dst->offset + req->dst->length <= PAGE_SIZE) {
 		one_entry_in_sg = 1;
 		scatterwalk_start(&src_sg_walk, req->src);
-		scatterwalk_start(&assoc_sg_walk, req->assoc);
-		src = scatterwalk_map(&src_sg_walk);
-		assoc = scatterwalk_map(&assoc_sg_walk);
+		assoc = scatterwalk_map(&src_sg_walk);
+		src = assoc + req->assoclen;
 		dst = src;
 		if (unlikely(req->src != req->dst)) {
 			scatterwalk_start(&dst_sg_walk, req->dst);
-			dst = scatterwalk_map(&dst_sg_walk);
+			dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
 		}
 
 	} else {
 		/* Allocate memory for src, dst, assoc */
-		src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
-		if (!src)
+		assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
+		if (!assoc)
 			return -ENOMEM;
-		assoc = (src + req->cryptlen);
-		scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
-		scatterwalk_map_and_copy(assoc, req->assoc, 0,
-			req->assoclen, 0);
+		scatterwalk_map_and_copy(assoc, req->src, 0,
+					 req->assoclen + req->cryptlen, 0);
+		src = assoc + req->assoclen;
 		dst = src;
 	}
 
+	kernel_fpu_begin();
 	aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv,
 		ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
 		authTag, auth_tag_len);
+	kernel_fpu_end();
 
 	/* Compare generated tag with passed in tag. */
 	retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ?
@@ -1154,90 +1085,59 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 
 	if (one_entry_in_sg) {
 		if (unlikely(req->src != req->dst)) {
-			scatterwalk_unmap(dst);
-			scatterwalk_done(&dst_sg_walk, 0, 0);
+			scatterwalk_unmap(dst - req->assoclen);
+			scatterwalk_advance(&dst_sg_walk, req->dst->length);
+			scatterwalk_done(&dst_sg_walk, 1, 0);
 		}
-		scatterwalk_unmap(src);
 		scatterwalk_unmap(assoc);
-		scatterwalk_done(&src_sg_walk, 0, 0);
-		scatterwalk_done(&assoc_sg_walk, 0, 0);
+		scatterwalk_advance(&src_sg_walk, req->src->length);
+		scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
 	} else {
-		scatterwalk_map_and_copy(dst, req->dst, 0, tempCipherLen, 1);
-		kfree(src);
+		scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
+					 tempCipherLen, 1);
+		kfree(assoc);
 	}
 	return retval;
 }
 
 static int rfc4106_encrypt(struct aead_request *req)
 {
-	int ret;
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+	struct aead_request *subreq = aead_request_ctx(req);
 
-	if (!irq_fpu_usable()) {
-		struct aead_request *cryptd_req =
-			(struct aead_request *) aead_request_ctx(req);
+	aead_request_set_tfm(subreq, irq_fpu_usable() ?
+				     cryptd_aead_child(cryptd_tfm) :
+				     &cryptd_tfm->base);
 
-		memcpy(cryptd_req, req, sizeof(*req));
-		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-		ret = crypto_aead_encrypt(cryptd_req);
-	} else {
-		kernel_fpu_begin();
-		ret = __driver_rfc4106_encrypt(req);
-		kernel_fpu_end();
-	}
-	return ret;
+	aead_request_set_callback(subreq, req->base.flags,
+				  req->base.complete, req->base.data);
+	aead_request_set_crypt(subreq, req->src, req->dst,
+			       req->cryptlen, req->iv);
+	aead_request_set_ad(subreq, req->assoclen);
+
+	return crypto_aead_encrypt(subreq);
 }
 
 static int rfc4106_decrypt(struct aead_request *req)
 {
-	int ret;
 	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+	struct cryptd_aead **ctx = crypto_aead_ctx(tfm);
+	struct cryptd_aead *cryptd_tfm = *ctx;
+	struct aead_request *subreq = aead_request_ctx(req);
 
-	if (!irq_fpu_usable()) {
-		struct aead_request *cryptd_req =
-			(struct aead_request *) aead_request_ctx(req);
+	aead_request_set_tfm(subreq, irq_fpu_usable() ?
+				     cryptd_aead_child(cryptd_tfm) :
+				     &cryptd_tfm->base);
 
-		memcpy(cryptd_req, req, sizeof(*req));
-		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-		ret = crypto_aead_decrypt(cryptd_req);
-	} else {
-		kernel_fpu_begin();
-		ret = __driver_rfc4106_decrypt(req);
-		kernel_fpu_end();
-	}
-	return ret;
-}
-
-static int helper_rfc4106_encrypt(struct aead_request *req)
-{
-	int ret;
-
-	if (unlikely(!irq_fpu_usable())) {
-		WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context");
-		ret = -EINVAL;
-	} else {
-		kernel_fpu_begin();
-		ret = __driver_rfc4106_encrypt(req);
-		kernel_fpu_end();
-	}
-	return ret;
-}
-
-static int helper_rfc4106_decrypt(struct aead_request *req)
-{
-	int ret;
+	aead_request_set_callback(subreq, req->base.flags,
+				  req->base.complete, req->base.data);
+	aead_request_set_crypt(subreq, req->src, req->dst,
+			       req->cryptlen, req->iv);
+	aead_request_set_ad(subreq, req->assoclen);
 
-	if (unlikely(!irq_fpu_usable())) {
-		WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context");
-		ret = -EINVAL;
-	} else {
-		kernel_fpu_begin();
-		ret = __driver_rfc4106_decrypt(req);
-		kernel_fpu_end();
-	}
-	return ret;
+	return crypto_aead_decrypt(subreq);
 }
 #endif
 
@@ -1410,51 +1310,6 @@ static struct crypto_alg aesni_algs[] = { {
 			.geniv		= "chainiv",
 		},
 	},
-}, {
-	.cra_name		= "__gcm-aes-aesni",
-	.cra_driver_name	= "__driver-gcm-aes-aesni",
-	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_INTERNAL,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct aesni_rfc4106_gcm_ctx) +
-				  AESNI_ALIGN,
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_aead_type,
-	.cra_module		= THIS_MODULE,
-	.cra_u = {
-		.aead = {
-			.setkey		= common_rfc4106_set_key,
-			.setauthsize	= common_rfc4106_set_authsize,
-			.encrypt	= helper_rfc4106_encrypt,
-			.decrypt	= helper_rfc4106_decrypt,
-			.ivsize		= 8,
-			.maxauthsize	= 16,
-		},
-	},
-}, {
-	.cra_name		= "rfc4106(gcm(aes))",
-	.cra_driver_name	= "rfc4106-gcm-aesni",
-	.cra_priority		= 400,
-	.cra_flags		= CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
-	.cra_blocksize		= 1,
-	.cra_ctxsize		= sizeof(struct aesni_rfc4106_gcm_ctx) +
-				  AESNI_ALIGN,
-	.cra_alignmask		= 0,
-	.cra_type		= &crypto_nivaead_type,
-	.cra_module		= THIS_MODULE,
-	.cra_init		= rfc4106_init,
-	.cra_exit		= rfc4106_exit,
-	.cra_u = {
-		.aead = {
-			.setkey		= rfc4106_set_key,
-			.setauthsize	= rfc4106_set_authsize,
-			.encrypt	= rfc4106_encrypt,
-			.decrypt	= rfc4106_decrypt,
-			.geniv		= "seqiv",
-			.ivsize		= 8,
-			.maxauthsize	= 16,
-		},
-	},
 #endif
 #if IS_ENABLED(CONFIG_CRYPTO_PCBC)
 }, {
@@ -1569,6 +1424,46 @@ static struct crypto_alg aesni_algs[] = { {
 	},
 } };
 
+#ifdef CONFIG_X86_64
+static struct aead_alg aesni_aead_algs[] = { {
+	.setkey			= common_rfc4106_set_key,
+	.setauthsize		= common_rfc4106_set_authsize,
+	.encrypt		= helper_rfc4106_encrypt,
+	.decrypt		= helper_rfc4106_decrypt,
+	.ivsize			= 8,
+	.maxauthsize		= 16,
+	.base = {
+		.cra_name		= "__gcm-aes-aesni",
+		.cra_driver_name	= "__driver-gcm-aes-aesni",
+		.cra_flags		= CRYPTO_ALG_INTERNAL,
+		.cra_blocksize		= 1,
+		.cra_ctxsize		= sizeof(struct aesni_rfc4106_gcm_ctx),
+		.cra_alignmask		= AESNI_ALIGN - 1,
+		.cra_module		= THIS_MODULE,
+	},
+}, {
+	.init			= rfc4106_init,
+	.exit			= rfc4106_exit,
+	.setkey			= rfc4106_set_key,
+	.setauthsize		= rfc4106_set_authsize,
+	.encrypt		= rfc4106_encrypt,
+	.decrypt		= rfc4106_decrypt,
+	.ivsize			= 8,
+	.maxauthsize		= 16,
+	.base = {
+		.cra_name		= "rfc4106(gcm(aes))",
+		.cra_driver_name	= "rfc4106-gcm-aesni",
+		.cra_priority		= 400,
+		.cra_flags		= CRYPTO_ALG_ASYNC,
+		.cra_blocksize		= 1,
+		.cra_ctxsize		= sizeof(struct cryptd_aead *),
+		.cra_module		= THIS_MODULE,
+	},
+} };
+#else
+static struct aead_alg aesni_aead_algs[0];
+#endif
+
 
 static const struct x86_cpu_id aesni_cpu_id[] = {
 	X86_FEATURE_MATCH(X86_FEATURE_AES),
@@ -1616,11 +1511,27 @@ static int __init aesni_init(void)
 	if (err)
 		return err;
 
-	return crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
+	err = crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
+	if (err)
+		goto fpu_exit;
+
+	err = crypto_register_aeads(aesni_aead_algs,
+				    ARRAY_SIZE(aesni_aead_algs));
+	if (err)
+		goto unregister_algs;
+
+	return err;
+
+unregister_algs:
+	crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
+fpu_exit:
+	crypto_fpu_exit();
+	return err;
 }
 
 static void __exit aesni_exit(void)
 {
+	crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs));
 	crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
 
 	crypto_fpu_exit();
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index baf0ac21ace5..4c65c70e628b 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -19,8 +19,7 @@
 #include <crypto/ctr.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/camellia.h>
 #include <asm/crypto/glue_helper.h>
 
@@ -561,16 +560,15 @@ static struct crypto_alg cmll_algs[10] = { {
 
 static int __init camellia_aesni_init(void)
 {
-	u64 xcr0;
+	const char *feature_name;
 
 	if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
 		pr_info("AVX2 or AES-NI instructions are not detected.\n");
 		return -ENODEV;
 	}
 
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX2 detected but unusable.\n");
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
 
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index 78818a1e73e3..80a0e4389c9a 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -19,8 +19,7 @@
 #include <crypto/ctr.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/camellia.h>
 #include <asm/crypto/glue_helper.h>
 
@@ -553,16 +552,10 @@ static struct crypto_alg cmll_algs[10] = { {
 
 static int __init camellia_aesni_init(void)
 {
-	u64 xcr0;
+	const char *feature_name;
 
-	if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
-		pr_info("AVX or AES-NI instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
 
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 236c80974457..be00aa48b2b5 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -31,8 +31,7 @@
 #include <crypto/cast5.h>
 #include <crypto/cryptd.h>
 #include <crypto/ctr.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/glue_helper.h>
 
 #define CAST5_PARALLEL_BLOCKS 16
@@ -468,16 +467,10 @@ static struct crypto_alg cast5_algs[6] = { {
 
 static int __init cast5_init(void)
 {
-	u64 xcr0;
+	const char *feature_name;
 
-	if (!cpu_has_avx || !cpu_has_osxsave) {
-		pr_info("AVX instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
 
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index f448810ca4ac..5dbba7224221 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -36,8 +36,7 @@
 #include <crypto/ctr.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/glue_helper.h>
 
 #define CAST6_PARALLEL_BLOCKS 8
@@ -590,16 +589,10 @@ static struct crypto_alg cast6_algs[10] = { {
 
 static int __init cast6_init(void)
 {
-	u64 xcr0;
+	const char *feature_name;
 
-	if (!cpu_has_avx || !cpu_has_osxsave) {
-		pr_info("AVX instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
 
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 1937fc1d8763..07d2c6c86a54 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -35,7 +35,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/cpu_device_id.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 #define CHKSUM_BLOCK_SIZE	1
 #define CHKSUM_DIGEST_SIZE	4
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 28640c3d6af7..81a595d75cf5 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -32,8 +32,7 @@
 
 #include <asm/cpufeature.h>
 #include <asm/cpu_device_id.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 
 #define CHKSUM_BLOCK_SIZE	1
 #define CHKSUM_DIGEST_SIZE	4
diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c
index b6c67bf30fdf..a3fcfc97a311 100644
--- a/arch/x86/crypto/crct10dif-pclmul_glue.c
+++ b/arch/x86/crypto/crct10dif-pclmul_glue.c
@@ -29,7 +29,7 @@
 #include <linux/init.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 #include <asm/cpufeature.h>
 #include <asm/cpu_device_id.h>
 
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
index f368ba261739..e7d679e2a018 100644
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -18,7 +18,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/crypto.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 struct crypto_fpu_ctx {
 	struct crypto_blkcipher *child;
@@ -156,7 +156,7 @@ int __init crypto_fpu_init(void)
 	return crypto_register_template(&crypto_fpu_tmpl);
 }
 
-void __exit crypto_fpu_exit(void)
+void crypto_fpu_exit(void)
 {
 	crypto_unregister_template(&crypto_fpu_tmpl);
 }
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 2079baf06bdd..64d7cf1b50e1 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -19,7 +19,7 @@
 #include <crypto/cryptd.h>
 #include <crypto/gf128mul.h>
 #include <crypto/internal/hash.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 #include <asm/cpu_device_id.h>
 
 #define GHASH_BLOCK_SIZE	16
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 2f63dc89e7a9..7d838dc4d888 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -20,8 +20,7 @@
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
 #include <crypto/serpent.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/serpent-avx.h>
 #include <asm/crypto/glue_helper.h>
 
@@ -537,16 +536,14 @@ static struct crypto_alg srp_algs[10] = { {
 
 static int __init init(void)
 {
-	u64 xcr0;
+	const char *feature_name;
 
 	if (!cpu_has_avx2 || !cpu_has_osxsave) {
 		pr_info("AVX2 instructions are not detected.\n");
 		return -ENODEV;
 	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
 
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index c8d478af8456..da7dafc9b16d 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -36,8 +36,7 @@
 #include <crypto/ctr.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/serpent-avx.h>
 #include <asm/crypto/glue_helper.h>
 
@@ -596,16 +595,10 @@ static struct crypto_alg serpent_algs[10] = { {
 
 static int __init serpent_init(void)
 {
-	u64 xcr0;
+	const char *feature_name;
 
-	if (!cpu_has_avx || !cpu_has_osxsave) {
-		printk(KERN_INFO "AVX instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		printk(KERN_INFO "AVX detected but unusable.\n");
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
 
diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index e510b1c5d690..a841e9765bd6 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -65,11 +65,8 @@
 #include <crypto/mcryptd.h>
 #include <crypto/crypto_wq.h>
 #include <asm/byteorder.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
 #include <linux/hardirq.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/api.h>
 #include "sha_mb_ctx.h"
 
 #define FLUSH_INTERVAL 1000 /* in usec */
@@ -885,7 +882,8 @@ static int __init sha1_mb_mod_init(void)
 		INIT_DELAYED_WORK(&cpu_state->flush, mcryptd_flusher);
 		cpu_state->cpu = cpu;
 		cpu_state->alg_state = &sha1_mb_alg_state;
-		cpu_state->mgr = (struct sha1_ctx_mgr *) kzalloc(sizeof(struct sha1_ctx_mgr), GFP_KERNEL);
+		cpu_state->mgr = kzalloc(sizeof(struct sha1_ctx_mgr),
+					GFP_KERNEL);
 		if (!cpu_state->mgr)
 			goto err2;
 		sha1_ctx_mgr_init(cpu_state->mgr);
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 33d1b9dc14cc..7c48e8b20848 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -29,9 +29,7 @@
 #include <linux/types.h>
 #include <crypto/sha.h>
 #include <crypto/sha1_base.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 
 
 asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
@@ -123,15 +121,9 @@ static struct shash_alg alg = {
 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
 {
-	u64 xcr0;
-
-	if (!cpu_has_avx || !cpu_has_osxsave)
-		return false;
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
-
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+		if (cpu_has_avx)
+			pr_info("AVX detected but unusable.\n");
 		return false;
 	}
 
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index ccc338881ee8..f8097fc0d1d1 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -37,9 +37,7 @@
 #include <linux/types.h>
 #include <crypto/sha.h>
 #include <crypto/sha256_base.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <linux/string.h>
 
 asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
@@ -132,15 +130,9 @@ static struct shash_alg algs[] = { {
 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
 {
-	u64 xcr0;
-
-	if (!cpu_has_avx || !cpu_has_osxsave)
-		return false;
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
-
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+		if (cpu_has_avx)
+			pr_info("AVX detected but unusable.\n");
 		return false;
 	}
 
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index d9fa4c1e063f..2edad7b81870 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -35,9 +35,7 @@
 #include <linux/types.h>
 #include <crypto/sha.h>
 #include <crypto/sha512_base.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 
 #include <linux/string.h>
 
@@ -131,15 +129,9 @@ static struct shash_alg algs[] = { {
 #ifdef CONFIG_AS_AVX
 static bool __init avx_usable(void)
 {
-	u64 xcr0;
-
-	if (!cpu_has_avx || !cpu_has_osxsave)
-		return false;
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		pr_info("AVX detected but unusable.\n");
-
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, NULL)) {
+		if (cpu_has_avx)
+			pr_info("AVX detected but unusable.\n");
 		return false;
 	}
 
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index b5e2d5651851..c2bd0ce718ee 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -36,9 +36,7 @@
 #include <crypto/ctr.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
-#include <asm/xsave.h>
+#include <asm/fpu/api.h>
 #include <asm/crypto/twofish.h>
 #include <asm/crypto/glue_helper.h>
 #include <crypto/scatterwalk.h>
@@ -558,16 +556,10 @@ static struct crypto_alg twofish_algs[10] = { {
 
 static int __init twofish_init(void)
 {
-	u64 xcr0;
+	const char *feature_name;
 
-	if (!cpu_has_avx || !cpu_has_osxsave) {
-		printk(KERN_INFO "AVX instructions are not detected.\n");
-		return -ENODEV;
-	}
-
-	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
-	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
-		printk(KERN_INFO "AVX detected but unusable.\n");
+	if (!cpu_has_xfeatures(XSTATE_SSE | XSTATE_YMM, &feature_name)) {
+		pr_info("CPU feature '%s' is not supported.\n", feature_name);
 		return -ENODEV;
 	}
 
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
new file mode 100644
index 000000000000..7a144971db79
--- /dev/null
+++ b/arch/x86/entry/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the x86 low level entry code
+#
+obj-y				:= entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
+
+obj-y				+= vdso/
+obj-y				+= vsyscall/
+
+obj-$(CONFIG_IA32_EMULATION)	+= entry_64_compat.o syscall_32.o
+
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/entry/calling.h
index 1c8b50edb2db..f4e6308c4200 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/entry/calling.h
@@ -46,8 +46,6 @@ For 32-bit we have the following conventions - kernel is built with
 
 */
 
-#include <asm/dwarf2.h>
-
 #ifdef CONFIG_X86_64
 
 /*
@@ -91,28 +89,27 @@ For 32-bit we have the following conventions - kernel is built with
 #define SIZEOF_PTREGS	21*8
 
 	.macro ALLOC_PT_GPREGS_ON_STACK addskip=0
-	subq	$15*8+\addskip, %rsp
-	CFI_ADJUST_CFA_OFFSET 15*8+\addskip
+	addq	$-(15*8+\addskip), %rsp
 	.endm
 
 	.macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
 	.if \r11
-	movq_cfi r11, 6*8+\offset
+	movq %r11, 6*8+\offset(%rsp)
 	.endif
 	.if \r8910
-	movq_cfi r10, 7*8+\offset
-	movq_cfi r9,  8*8+\offset
-	movq_cfi r8,  9*8+\offset
+	movq %r10, 7*8+\offset(%rsp)
+	movq %r9,  8*8+\offset(%rsp)
+	movq %r8,  9*8+\offset(%rsp)
 	.endif
 	.if \rax
-	movq_cfi rax, 10*8+\offset
+	movq %rax, 10*8+\offset(%rsp)
 	.endif
 	.if \rcx
-	movq_cfi rcx, 11*8+\offset
+	movq %rcx, 11*8+\offset(%rsp)
 	.endif
-	movq_cfi rdx, 12*8+\offset
-	movq_cfi rsi, 13*8+\offset
-	movq_cfi rdi, 14*8+\offset
+	movq %rdx, 12*8+\offset(%rsp)
+	movq %rsi, 13*8+\offset(%rsp)
+	movq %rdi, 14*8+\offset(%rsp)
 	.endm
 	.macro SAVE_C_REGS offset=0
 	SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
@@ -131,24 +128,24 @@ For 32-bit we have the following conventions - kernel is built with
 	.endm
 
 	.macro SAVE_EXTRA_REGS offset=0
-	movq_cfi r15, 0*8+\offset
-	movq_cfi r14, 1*8+\offset
-	movq_cfi r13, 2*8+\offset
-	movq_cfi r12, 3*8+\offset
-	movq_cfi rbp, 4*8+\offset
-	movq_cfi rbx, 5*8+\offset
+	movq %r15, 0*8+\offset(%rsp)
+	movq %r14, 1*8+\offset(%rsp)
+	movq %r13, 2*8+\offset(%rsp)
+	movq %r12, 3*8+\offset(%rsp)
+	movq %rbp, 4*8+\offset(%rsp)
+	movq %rbx, 5*8+\offset(%rsp)
 	.endm
 	.macro SAVE_EXTRA_REGS_RBP offset=0
-	movq_cfi rbp, 4*8+\offset
+	movq %rbp, 4*8+\offset(%rsp)
 	.endm
 
 	.macro RESTORE_EXTRA_REGS offset=0
-	movq_cfi_restore 0*8+\offset, r15
-	movq_cfi_restore 1*8+\offset, r14
-	movq_cfi_restore 2*8+\offset, r13
-	movq_cfi_restore 3*8+\offset, r12
-	movq_cfi_restore 4*8+\offset, rbp
-	movq_cfi_restore 5*8+\offset, rbx
+	movq 0*8+\offset(%rsp), %r15
+	movq 1*8+\offset(%rsp), %r14
+	movq 2*8+\offset(%rsp), %r13
+	movq 3*8+\offset(%rsp), %r12
+	movq 4*8+\offset(%rsp), %rbp
+	movq 5*8+\offset(%rsp), %rbx
 	.endm
 
 	.macro ZERO_EXTRA_REGS
@@ -162,24 +159,24 @@ For 32-bit we have the following conventions - kernel is built with
 
 	.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
 	.if \rstor_r11
-	movq_cfi_restore 6*8, r11
+	movq 6*8(%rsp), %r11
 	.endif
 	.if \rstor_r8910
-	movq_cfi_restore 7*8, r10
-	movq_cfi_restore 8*8, r9
-	movq_cfi_restore 9*8, r8
+	movq 7*8(%rsp), %r10
+	movq 8*8(%rsp), %r9
+	movq 9*8(%rsp), %r8
 	.endif
 	.if \rstor_rax
-	movq_cfi_restore 10*8, rax
+	movq 10*8(%rsp), %rax
 	.endif
 	.if \rstor_rcx
-	movq_cfi_restore 11*8, rcx
+	movq 11*8(%rsp), %rcx
 	.endif
 	.if \rstor_rdx
-	movq_cfi_restore 12*8, rdx
+	movq 12*8(%rsp), %rdx
 	.endif
-	movq_cfi_restore 13*8, rsi
-	movq_cfi_restore 14*8, rdi
+	movq 13*8(%rsp), %rsi
+	movq 14*8(%rsp), %rdi
 	.endm
 	.macro RESTORE_C_REGS
 	RESTORE_C_REGS_HELPER 1,1,1,1,1
@@ -204,8 +201,7 @@ For 32-bit we have the following conventions - kernel is built with
 	.endm
 
 	.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
-	addq $15*8+\addskip, %rsp
-	CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
+	subq $-(15*8+\addskip), %rsp
 	.endm
 
 	.macro icebp
@@ -224,23 +220,23 @@ For 32-bit we have the following conventions - kernel is built with
  */
 
 	.macro SAVE_ALL
-	pushl_cfi_reg eax
-	pushl_cfi_reg ebp
-	pushl_cfi_reg edi
-	pushl_cfi_reg esi
-	pushl_cfi_reg edx
-	pushl_cfi_reg ecx
-	pushl_cfi_reg ebx
+	pushl %eax
+	pushl %ebp
+	pushl %edi
+	pushl %esi
+	pushl %edx
+	pushl %ecx
+	pushl %ebx
 	.endm
 
 	.macro RESTORE_ALL
-	popl_cfi_reg ebx
-	popl_cfi_reg ecx
-	popl_cfi_reg edx
-	popl_cfi_reg esi
-	popl_cfi_reg edi
-	popl_cfi_reg ebp
-	popl_cfi_reg eax
+	popl %ebx
+	popl %ecx
+	popl %edx
+	popl %esi
+	popl %edi
+	popl %ebp
+	popl %eax
 	.endm
 
 #endif /* CONFIG_X86_64 */
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
new file mode 100644
index 000000000000..21dc60a60b5f
--- /dev/null
+++ b/arch/x86/entry/entry_32.S
@@ -0,0 +1,1248 @@
+/*
+ *  Copyright (C) 1991,1992  Linus Torvalds
+ *
+ * entry_32.S contains the system-call and low-level fault and trap handling routines.
+ *
+ * Stack layout in 'syscall_exit':
+ *	ptrace needs to have all registers on the stack.
+ *	If the order here is changed, it needs to be
+ *	updated in fork.c:copy_process(), signal.c:do_signal(),
+ *	ptrace.c and ptrace.h
+ *
+ *	 0(%esp) - %ebx
+ *	 4(%esp) - %ecx
+ *	 8(%esp) - %edx
+ *	 C(%esp) - %esi
+ *	10(%esp) - %edi
+ *	14(%esp) - %ebp
+ *	18(%esp) - %eax
+ *	1C(%esp) - %ds
+ *	20(%esp) - %es
+ *	24(%esp) - %fs
+ *	28(%esp) - %gs		saved iff !CONFIG_X86_32_LAZY_GS
+ *	2C(%esp) - orig_eax
+ *	30(%esp) - %eip
+ *	34(%esp) - %cs
+ *	38(%esp) - %eflags
+ *	3C(%esp) - %oldesp
+ *	40(%esp) - %oldss
+ */
+
+#include <linux/linkage.h>
+#include <linux/err.h>
+#include <asm/thread_info.h>
+#include <asm/irqflags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+#include <asm/smp.h>
+#include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/ftrace.h>
+#include <asm/irq_vectors.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE		0x40000000
+
+#ifndef CONFIG_AUDITSYSCALL
+# define sysenter_audit		syscall_trace_entry
+# define sysexit_audit		syscall_exit_work
+#endif
+
+	.section .entry.text, "ax"
+
+/*
+ * We use macros for low-level operations which need to be overridden
+ * for paravirtualization.  The following will never clobber any registers:
+ *   INTERRUPT_RETURN (aka. "iret")
+ *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+ *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+ * Allowing a register to be clobbered can shrink the paravirt replacement
+ * enough to patch inline, increasing performance.
+ */
+
+#ifdef CONFIG_PREEMPT
+# define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
+#else
+# define preempt_stop(clobbers)
+# define resume_kernel		restore_all
+#endif
+
+.macro TRACE_IRQS_IRET
+#ifdef CONFIG_TRACE_IRQFLAGS
+	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)     # interrupts off?
+	jz	1f
+	TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+/*
+ * User gs save/restore
+ *
+ * %gs is used for userland TLS and kernel only uses it for stack
+ * canary which is required to be at %gs:20 by gcc.  Read the comment
+ * at the top of stackprotector.h for more info.
+ *
+ * Local labels 98 and 99 are used.
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+
+ /* unfortunately push/pop can't be no-op */
+.macro PUSH_GS
+	pushl	$0
+.endm
+.macro POP_GS pop=0
+	addl	$(4 + \pop), %esp
+.endm
+.macro POP_GS_EX
+.endm
+
+ /* all the rest are no-op */
+.macro PTGS_TO_GS
+.endm
+.macro PTGS_TO_GS_EX
+.endm
+.macro GS_TO_REG reg
+.endm
+.macro REG_TO_PTGS reg
+.endm
+.macro SET_KERNEL_GS reg
+.endm
+
+#else	/* CONFIG_X86_32_LAZY_GS */
+
+.macro PUSH_GS
+	pushl	%gs
+.endm
+
+.macro POP_GS pop=0
+98:	popl	%gs
+  .if \pop <> 0
+	add	$\pop, %esp
+  .endif
+.endm
+.macro POP_GS_EX
+.pushsection .fixup, "ax"
+99:	movl	$0, (%esp)
+	jmp	98b
+.popsection
+	_ASM_EXTABLE(98b, 99b)
+.endm
+
+.macro PTGS_TO_GS
+98:	mov	PT_GS(%esp), %gs
+.endm
+.macro PTGS_TO_GS_EX
+.pushsection .fixup, "ax"
+99:	movl	$0, PT_GS(%esp)
+	jmp	98b
+.popsection
+	_ASM_EXTABLE(98b, 99b)
+.endm
+
+.macro GS_TO_REG reg
+	movl	%gs, \reg
+.endm
+.macro REG_TO_PTGS reg
+	movl	\reg, PT_GS(%esp)
+.endm
+.macro SET_KERNEL_GS reg
+	movl	$(__KERNEL_STACK_CANARY), \reg
+	movl	\reg, %gs
+.endm
+
+#endif /* CONFIG_X86_32_LAZY_GS */
+
+.macro SAVE_ALL
+	cld
+	PUSH_GS
+	pushl	%fs
+	pushl	%es
+	pushl	%ds
+	pushl	%eax
+	pushl	%ebp
+	pushl	%edi
+	pushl	%esi
+	pushl	%edx
+	pushl	%ecx
+	pushl	%ebx
+	movl	$(__USER_DS), %edx
+	movl	%edx, %ds
+	movl	%edx, %es
+	movl	$(__KERNEL_PERCPU), %edx
+	movl	%edx, %fs
+	SET_KERNEL_GS %edx
+.endm
+
+.macro RESTORE_INT_REGS
+	popl	%ebx
+	popl	%ecx
+	popl	%edx
+	popl	%esi
+	popl	%edi
+	popl	%ebp
+	popl	%eax
+.endm
+
+.macro RESTORE_REGS pop=0
+	RESTORE_INT_REGS
+1:	popl	%ds
+2:	popl	%es
+3:	popl	%fs
+	POP_GS \pop
+.pushsection .fixup, "ax"
+4:	movl	$0, (%esp)
+	jmp	1b
+5:	movl	$0, (%esp)
+	jmp	2b
+6:	movl	$0, (%esp)
+	jmp	3b
+.popsection
+	_ASM_EXTABLE(1b, 4b)
+	_ASM_EXTABLE(2b, 5b)
+	_ASM_EXTABLE(3b, 6b)
+	POP_GS_EX
+.endm
+
+ENTRY(ret_from_fork)
+	pushl	%eax
+	call	schedule_tail
+	GET_THREAD_INFO(%ebp)
+	popl	%eax
+	pushl	$0x0202				# Reset kernel eflags
+	popfl
+	jmp	syscall_exit
+END(ret_from_fork)
+
+ENTRY(ret_from_kernel_thread)
+	pushl	%eax
+	call	schedule_tail
+	GET_THREAD_INFO(%ebp)
+	popl	%eax
+	pushl	$0x0202				# Reset kernel eflags
+	popfl
+	movl	PT_EBP(%esp), %eax
+	call	*PT_EBX(%esp)
+	movl	$0, PT_EAX(%esp)
+	jmp	syscall_exit
+ENDPROC(ret_from_kernel_thread)
+
+/*
+ * Return to user mode is not as complex as all this looks,
+ * but we want the default path for a system call return to
+ * go as quickly as possible which is why some of this is
+ * less clear than it otherwise should be.
+ */
+
+	# userspace resumption stub bypassing syscall exit tracing
+	ALIGN
+ret_from_exception:
+	preempt_stop(CLBR_ANY)
+ret_from_intr:
+	GET_THREAD_INFO(%ebp)
+#ifdef CONFIG_VM86
+	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS and CS
+	movb	PT_CS(%esp), %al
+	andl	$(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+#else
+	/*
+	 * We can be coming here from child spawned by kernel_thread().
+	 */
+	movl	PT_CS(%esp), %eax
+	andl	$SEGMENT_RPL_MASK, %eax
+#endif
+	cmpl	$USER_RPL, %eax
+	jb	resume_kernel			# not returning to v8086 or userspace
+
+ENTRY(resume_userspace)
+	LOCKDEP_SYS_EXIT
+	DISABLE_INTERRUPTS(CLBR_ANY)		# make sure we don't miss an interrupt
+						# setting need_resched or sigpending
+						# between sampling and the iret
+	TRACE_IRQS_OFF
+	movl	TI_flags(%ebp), %ecx
+	andl	$_TIF_WORK_MASK, %ecx		# is there any work to be done on
+						# int/exception return?
+	jne	work_pending
+	jmp	restore_all
+END(ret_from_exception)
+
+#ifdef CONFIG_PREEMPT
+ENTRY(resume_kernel)
+	DISABLE_INTERRUPTS(CLBR_ANY)
+need_resched:
+	cmpl	$0, PER_CPU_VAR(__preempt_count)
+	jnz	restore_all
+	testl	$X86_EFLAGS_IF, PT_EFLAGS(%esp)	# interrupts off (exception path) ?
+	jz	restore_all
+	call	preempt_schedule_irq
+	jmp	need_resched
+END(resume_kernel)
+#endif
+
+/*
+ * SYSENTER_RETURN points to after the SYSENTER instruction
+ * in the vsyscall page.  See vsyscall-sysentry.S, which defines
+ * the symbol.
+ */
+
+	# SYSENTER  call handler stub
+ENTRY(entry_SYSENTER_32)
+	movl	TSS_sysenter_sp0(%esp), %esp
+sysenter_past_esp:
+	/*
+	 * Interrupts are disabled here, but we can't trace it until
+	 * enough kernel state to call TRACE_IRQS_OFF can be called - but
+	 * we immediately enable interrupts at that point anyway.
+	 */
+	pushl	$__USER_DS
+	pushl	%ebp
+	pushfl
+	orl	$X86_EFLAGS_IF, (%esp)
+	pushl	$__USER_CS
+	/*
+	 * Push current_thread_info()->sysenter_return to the stack.
+	 * A tiny bit of offset fixup is necessary: TI_sysenter_return
+	 * is relative to thread_info, which is at the bottom of the
+	 * kernel stack page.  4*4 means the 4 words pushed above;
+	 * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
+	 * and THREAD_SIZE takes us to the bottom.
+	 */
+	pushl	((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
+
+	pushl	%eax
+	SAVE_ALL
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
+/*
+ * Load the potential sixth argument from user stack.
+ * Careful about security.
+ */
+	cmpl	$__PAGE_OFFSET-3, %ebp
+	jae	syscall_fault
+	ASM_STAC
+1:	movl	(%ebp), %ebp
+	ASM_CLAC
+	movl	%ebp, PT_EBP(%esp)
+	_ASM_EXTABLE(1b, syscall_fault)
+
+	GET_THREAD_INFO(%ebp)
+
+	testl	$_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
+	jnz	sysenter_audit
+sysenter_do_call:
+	cmpl	$(NR_syscalls), %eax
+	jae	sysenter_badsys
+	call	*sys_call_table(, %eax, 4)
+sysenter_after_call:
+	movl	%eax, PT_EAX(%esp)
+	LOCKDEP_SYS_EXIT
+	DISABLE_INTERRUPTS(CLBR_ANY)
+	TRACE_IRQS_OFF
+	movl	TI_flags(%ebp), %ecx
+	testl	$_TIF_ALLWORK_MASK, %ecx
+	jnz	sysexit_audit
+sysenter_exit:
+/* if something modifies registers it must also disable sysexit */
+	movl	PT_EIP(%esp), %edx
+	movl	PT_OLDESP(%esp), %ecx
+	xorl	%ebp, %ebp
+	TRACE_IRQS_ON
+1:	mov	PT_FS(%esp), %fs
+	PTGS_TO_GS
+	ENABLE_INTERRUPTS_SYSEXIT
+
+#ifdef CONFIG_AUDITSYSCALL
+sysenter_audit:
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), TI_flags(%ebp)
+	jnz	syscall_trace_entry
+	/* movl	PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */
+	movl	PT_EBX(%esp), %edx		/* ebx/a0: 2nd arg to audit */
+	/* movl	PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */
+	pushl	PT_ESI(%esp)			/* a3: 5th arg */
+	pushl	PT_EDX+4(%esp)			/* a2: 4th arg */
+	call	__audit_syscall_entry
+	popl	%ecx				/* get that remapped edx off the stack */
+	popl	%ecx				/* get that remapped esi off the stack */
+	movl	PT_EAX(%esp), %eax		/* reload syscall number */
+	jmp	sysenter_do_call
+
+sysexit_audit:
+	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
+	jnz	syscall_exit_work
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_ANY)
+	movl	%eax, %edx			/* second arg, syscall return value */
+	cmpl	$-MAX_ERRNO, %eax		/* is it an error ? */
+	setbe %al				/* 1 if so, 0 if not */
+	movzbl %al, %eax			/* zero-extend that */
+	call	__audit_syscall_exit
+	DISABLE_INTERRUPTS(CLBR_ANY)
+	TRACE_IRQS_OFF
+	movl	TI_flags(%ebp), %ecx
+	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
+	jnz	syscall_exit_work
+	movl	PT_EAX(%esp), %eax		/* reload syscall return value */
+	jmp	sysenter_exit
+#endif
+
+.pushsection .fixup, "ax"
+2:	movl	$0, PT_FS(%esp)
+	jmp	1b
+.popsection
+	_ASM_EXTABLE(1b, 2b)
+	PTGS_TO_GS_EX
+ENDPROC(entry_SYSENTER_32)
+
+	# system call handler stub
+ENTRY(entry_INT80_32)
+	ASM_CLAC
+	pushl	%eax				# save orig_eax
+	SAVE_ALL
+	GET_THREAD_INFO(%ebp)
+						# system call tracing in operation / emulation
+	testl	$_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
+	jnz	syscall_trace_entry
+	cmpl	$(NR_syscalls), %eax
+	jae	syscall_badsys
+syscall_call:
+	call	*sys_call_table(, %eax, 4)
+syscall_after_call:
+	movl	%eax, PT_EAX(%esp)		# store the return value
+syscall_exit:
+	LOCKDEP_SYS_EXIT
+	DISABLE_INTERRUPTS(CLBR_ANY)		# make sure we don't miss an interrupt
+						# setting need_resched or sigpending
+						# between sampling and the iret
+	TRACE_IRQS_OFF
+	movl	TI_flags(%ebp), %ecx
+	testl	$_TIF_ALLWORK_MASK, %ecx	# current->work
+	jnz	syscall_exit_work
+
+restore_all:
+	TRACE_IRQS_IRET
+restore_all_notrace:
+#ifdef CONFIG_X86_ESPFIX32
+	movl	PT_EFLAGS(%esp), %eax		# mix EFLAGS, SS and CS
+	/*
+	 * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
+	 * are returning to the kernel.
+	 * See comments in process.c:copy_thread() for details.
+	 */
+	movb	PT_OLDSS(%esp), %ah
+	movb	PT_CS(%esp), %al
+	andl	$(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+	cmpl	$((SEGMENT_LDT << 8) | USER_RPL), %eax
+	je ldt_ss				# returning to user-space with LDT SS
+#endif
+restore_nocheck:
+	RESTORE_REGS 4				# skip orig_eax/error_code
+irq_return:
+	INTERRUPT_RETURN
+.section .fixup, "ax"
+ENTRY(iret_exc	)
+	pushl	$0				# no error code
+	pushl	$do_iret_error
+	jmp	error_code
+.previous
+	_ASM_EXTABLE(irq_return, iret_exc)
+
+#ifdef CONFIG_X86_ESPFIX32
+ldt_ss:
+#ifdef CONFIG_PARAVIRT
+	/*
+	 * The kernel can't run on a non-flat stack if paravirt mode
+	 * is active.  Rather than try to fixup the high bits of
+	 * ESP, bypass this code entirely.  This may break DOSemu
+	 * and/or Wine support in a paravirt VM, although the option
+	 * is still available to implement the setting of the high
+	 * 16-bits in the INTERRUPT_RETURN paravirt-op.
+	 */
+	cmpl	$0, pv_info+PARAVIRT_enabled
+	jne	restore_nocheck
+#endif
+
+/*
+ * Setup and switch to ESPFIX stack
+ *
+ * We're returning to userspace with a 16 bit stack. The CPU will not
+ * restore the high word of ESP for us on executing iret... This is an
+ * "official" bug of all the x86-compatible CPUs, which we can work
+ * around to make dosemu and wine happy. We do this by preloading the
+ * high word of ESP with the high word of the userspace ESP while
+ * compensating for the offset by changing to the ESPFIX segment with
+ * a base address that matches for the difference.
+ */
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
+	mov	%esp, %edx			/* load kernel esp */
+	mov	PT_OLDESP(%esp), %eax		/* load userspace esp */
+	mov	%dx, %ax			/* eax: new kernel esp */
+	sub	%eax, %edx			/* offset (low word is 0) */
+	shr	$16, %edx
+	mov	%dl, GDT_ESPFIX_SS + 4		/* bits 16..23 */
+	mov	%dh, GDT_ESPFIX_SS + 7		/* bits 24..31 */
+	pushl	$__ESPFIX_SS
+	pushl	%eax				/* new kernel esp */
+	/*
+	 * Disable interrupts, but do not irqtrace this section: we
+	 * will soon execute iret and the tracer was already set to
+	 * the irqstate after the IRET:
+	 */
+	DISABLE_INTERRUPTS(CLBR_EAX)
+	lss	(%esp), %esp			/* switch to espfix segment */
+	jmp	restore_nocheck
+#endif
+ENDPROC(entry_INT80_32)
+
+	# perform work that needs to be done immediately before resumption
+	ALIGN
+work_pending:
+	testb	$_TIF_NEED_RESCHED, %cl
+	jz	work_notifysig
+work_resched:
+	call	schedule
+	LOCKDEP_SYS_EXIT
+	DISABLE_INTERRUPTS(CLBR_ANY)		# make sure we don't miss an interrupt
+						# setting need_resched or sigpending
+						# between sampling and the iret
+	TRACE_IRQS_OFF
+	movl	TI_flags(%ebp), %ecx
+	andl	$_TIF_WORK_MASK, %ecx		# is there any work to be done other
+						# than syscall tracing?
+	jz	restore_all
+	testb	$_TIF_NEED_RESCHED, %cl
+	jnz	work_resched
+
+work_notifysig:					# deal with pending signals and
+						# notify-resume requests
+#ifdef CONFIG_VM86
+	testl	$X86_EFLAGS_VM, PT_EFLAGS(%esp)
+	movl	%esp, %eax
+	jnz	work_notifysig_v86		# returning to kernel-space or
+						# vm86-space
+1:
+#else
+	movl	%esp, %eax
+#endif
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_NONE)
+	movb	PT_CS(%esp), %bl
+	andb	$SEGMENT_RPL_MASK, %bl
+	cmpb	$USER_RPL, %bl
+	jb	resume_kernel
+	xorl	%edx, %edx
+	call	do_notify_resume
+	jmp	resume_userspace
+
+#ifdef CONFIG_VM86
+	ALIGN
+work_notifysig_v86:
+	pushl	%ecx				# save ti_flags for do_notify_resume
+	call	save_v86_state			# %eax contains pt_regs pointer
+	popl	%ecx
+	movl	%eax, %esp
+	jmp	1b
+#endif
+END(work_pending)
+
+	# perform syscall exit tracing
+	ALIGN
+syscall_trace_entry:
+	movl	$-ENOSYS, PT_EAX(%esp)
+	movl	%esp, %eax
+	call	syscall_trace_enter
+	/* What it returned is what we'll actually use.  */
+	cmpl	$(NR_syscalls), %eax
+	jnae	syscall_call
+	jmp	syscall_exit
+END(syscall_trace_entry)
+
+	# perform syscall exit tracing
+	ALIGN
+syscall_exit_work:
+	testl	$_TIF_WORK_SYSCALL_EXIT, %ecx
+	jz	work_pending
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_ANY)		# could let syscall_trace_leave() call
+						# schedule() instead
+	movl	%esp, %eax
+	call	syscall_trace_leave
+	jmp	resume_userspace
+END(syscall_exit_work)
+
+syscall_fault:
+	ASM_CLAC
+	GET_THREAD_INFO(%ebp)
+	movl	$-EFAULT, PT_EAX(%esp)
+	jmp	resume_userspace
+END(syscall_fault)
+
+syscall_badsys:
+	movl	$-ENOSYS, %eax
+	jmp	syscall_after_call
+END(syscall_badsys)
+
+sysenter_badsys:
+	movl	$-ENOSYS, %eax
+	jmp	sysenter_after_call
+END(sysenter_badsys)
+
+.macro FIXUP_ESPFIX_STACK
+/*
+ * Switch back for ESPFIX stack to the normal zerobased stack
+ *
+ * We can't call C functions using the ESPFIX stack. This code reads
+ * the high word of the segment base from the GDT and swiches to the
+ * normal stack and adjusts ESP with the matching offset.
+ */
+#ifdef CONFIG_X86_ESPFIX32
+	/* fixup the stack */
+	mov	GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
+	mov	GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
+	shl	$16, %eax
+	addl	%esp, %eax			/* the adjusted stack pointer */
+	pushl	$__KERNEL_DS
+	pushl	%eax
+	lss	(%esp), %esp			/* switch to the normal stack segment */
+#endif
+.endm
+.macro UNWIND_ESPFIX_STACK
+#ifdef CONFIG_X86_ESPFIX32
+	movl	%ss, %eax
+	/* see if on espfix stack */
+	cmpw	$__ESPFIX_SS, %ax
+	jne	27f
+	movl	$__KERNEL_DS, %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	/* switch to normal stack */
+	FIXUP_ESPFIX_STACK
+27:
+#endif
+.endm
+
+/*
+ * Build the entry stubs with some assembler magic.
+ * We pack 1 stub into every 8-byte block.
+ */
+	.align 8
+ENTRY(irq_entries_start)
+    vector=FIRST_EXTERNAL_VECTOR
+    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+	pushl	$(~vector+0x80)			/* Note: always in signed byte range */
+    vector=vector+1
+	jmp	common_interrupt
+	.align	8
+    .endr
+END(irq_entries_start)
+
+/*
+ * the CPU automatically disables interrupts when executing an IRQ vector,
+ * so IRQ-flags tracing has to follow that:
+ */
+	.p2align CONFIG_X86_L1_CACHE_SHIFT
+common_interrupt:
+	ASM_CLAC
+	addl	$-0x80, (%esp)			/* Adjust vector into the [-256, -1] range */
+	SAVE_ALL
+	TRACE_IRQS_OFF
+	movl	%esp, %eax
+	call	do_IRQ
+	jmp	ret_from_intr
+ENDPROC(common_interrupt)
+
+#define BUILD_INTERRUPT3(name, nr, fn)	\
+ENTRY(name)				\
+	ASM_CLAC;			\
+	pushl	$~(nr);			\
+	SAVE_ALL;			\
+	TRACE_IRQS_OFF			\
+	movl	%esp, %eax;		\
+	call	fn;			\
+	jmp	ret_from_intr;		\
+ENDPROC(name)
+
+
+#ifdef CONFIG_TRACING
+# define TRACE_BUILD_INTERRUPT(name, nr)	BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
+#else
+# define TRACE_BUILD_INTERRUPT(name, nr)
+#endif
+
+#define BUILD_INTERRUPT(name, nr)		\
+	BUILD_INTERRUPT3(name, nr, smp_##name);	\
+	TRACE_BUILD_INTERRUPT(name, nr)
+
+/* The include is where all of the SMP etc. interrupts come from */
+#include <asm/entry_arch.h>
+
+ENTRY(coprocessor_error)
+	ASM_CLAC
+	pushl	$0
+	pushl	$do_coprocessor_error
+	jmp	error_code
+END(coprocessor_error)
+
+ENTRY(simd_coprocessor_error)
+	ASM_CLAC
+	pushl	$0
+#ifdef CONFIG_X86_INVD_BUG
+	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
+	ALTERNATIVE "pushl	$do_general_protection",	\
+		    "pushl	$do_simd_coprocessor_error",	\
+		    X86_FEATURE_XMM
+#else
+	pushl	$do_simd_coprocessor_error
+#endif
+	jmp	error_code
+END(simd_coprocessor_error)
+
+ENTRY(device_not_available)
+	ASM_CLAC
+	pushl	$-1				# mark this as an int
+	pushl	$do_device_not_available
+	jmp	error_code
+END(device_not_available)
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_iret)
+	iret
+	_ASM_EXTABLE(native_iret, iret_exc)
+END(native_iret)
+
+ENTRY(native_irq_enable_sysexit)
+	sti
+	sysexit
+END(native_irq_enable_sysexit)
+#endif
+
+ENTRY(overflow)
+	ASM_CLAC
+	pushl	$0
+	pushl	$do_overflow
+	jmp	error_code
+END(overflow)
+
+ENTRY(bounds)
+	ASM_CLAC
+	pushl	$0
+	pushl	$do_bounds
+	jmp	error_code
+END(bounds)
+
+ENTRY(invalid_op)
+	ASM_CLAC
+	pushl	$0
+	pushl	$do_invalid_op
+	jmp	error_code
+END(invalid_op)
+
+ENTRY(coprocessor_segment_overrun)
+	ASM_CLAC
+	pushl	$0
+	pushl	$do_coprocessor_segment_overrun
+	jmp	error_code
+END(coprocessor_segment_overrun)
+
+ENTRY(invalid_TSS)
+	ASM_CLAC
+	pushl	$do_invalid_TSS
+	jmp	error_code
+END(invalid_TSS)
+
+ENTRY(segment_not_present)
+	ASM_CLAC
+	pushl	$do_segment_not_present
+	jmp	error_code
+END(segment_not_present)
+
+ENTRY(stack_segment)
+	ASM_CLAC
+	pushl	$do_stack_segment
+	jmp	error_code
+END(stack_segment)
+
+ENTRY(alignment_check)
+	ASM_CLAC
+	pushl	$do_alignment_check
+	jmp	error_code
+END(alignment_check)
+
+ENTRY(divide_error)
+	ASM_CLAC
+	pushl	$0				# no error code
+	pushl	$do_divide_error
+	jmp	error_code
+END(divide_error)
+
+#ifdef CONFIG_X86_MCE
+ENTRY(machine_check)
+	ASM_CLAC
+	pushl	$0
+	pushl	machine_check_vector
+	jmp	error_code
+END(machine_check)
+#endif
+
+ENTRY(spurious_interrupt_bug)
+	ASM_CLAC
+	pushl	$0
+	pushl	$do_spurious_interrupt_bug
+	jmp	error_code
+END(spurious_interrupt_bug)
+
+#ifdef CONFIG_XEN
+/*
+ * Xen doesn't set %esp to be precisely what the normal SYSENTER
+ * entry point expects, so fix it up before using the normal path.
+ */
+ENTRY(xen_sysenter_target)
+	addl	$5*4, %esp			/* remove xen-provided frame */
+	jmp	sysenter_past_esp
+
+ENTRY(xen_hypervisor_callback)
+	pushl	$-1				/* orig_ax = -1 => not a system call */
+	SAVE_ALL
+	TRACE_IRQS_OFF
+
+	/*
+	 * Check to see if we got the event in the critical
+	 * region in xen_iret_direct, after we've reenabled
+	 * events and checked for pending events.  This simulates
+	 * iret instruction's behaviour where it delivers a
+	 * pending interrupt when enabling interrupts:
+	 */
+	movl	PT_EIP(%esp), %eax
+	cmpl	$xen_iret_start_crit, %eax
+	jb	1f
+	cmpl	$xen_iret_end_crit, %eax
+	jae	1f
+
+	jmp	xen_iret_crit_fixup
+
+ENTRY(xen_do_upcall)
+1:	mov	%esp, %eax
+	call	xen_evtchn_do_upcall
+#ifndef CONFIG_PREEMPT
+	call	xen_maybe_preempt_hcall
+#endif
+	jmp	ret_from_intr
+ENDPROC(xen_hypervisor_callback)
+
+/*
+ * Hypervisor uses this for application faults while it executes.
+ * We get here for two reasons:
+ *  1. Fault while reloading DS, ES, FS or GS
+ *  2. Fault while executing IRET
+ * Category 1 we fix up by reattempting the load, and zeroing the segment
+ * register if the load fails.
+ * Category 2 we fix up by jumping to do_iret_error. We cannot use the
+ * normal Linux return path in this case because if we use the IRET hypercall
+ * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+ * We distinguish between categories by maintaining a status value in EAX.
+ */
+ENTRY(xen_failsafe_callback)
+	pushl	%eax
+	movl	$1, %eax
+1:	mov	4(%esp), %ds
+2:	mov	8(%esp), %es
+3:	mov	12(%esp), %fs
+4:	mov	16(%esp), %gs
+	/* EAX == 0 => Category 1 (Bad segment)
+	   EAX != 0 => Category 2 (Bad IRET) */
+	testl	%eax, %eax
+	popl	%eax
+	lea	16(%esp), %esp
+	jz	5f
+	jmp	iret_exc
+5:	pushl	$-1				/* orig_ax = -1 => not a system call */
+	SAVE_ALL
+	jmp	ret_from_exception
+
+.section .fixup, "ax"
+6:	xorl	%eax, %eax
+	movl	%eax, 4(%esp)
+	jmp	1b
+7:	xorl	%eax, %eax
+	movl	%eax, 8(%esp)
+	jmp	2b
+8:	xorl	%eax, %eax
+	movl	%eax, 12(%esp)
+	jmp	3b
+9:	xorl	%eax, %eax
+	movl	%eax, 16(%esp)
+	jmp	4b
+.previous
+	_ASM_EXTABLE(1b, 6b)
+	_ASM_EXTABLE(2b, 7b)
+	_ASM_EXTABLE(3b, 8b)
+	_ASM_EXTABLE(4b, 9b)
+ENDPROC(xen_failsafe_callback)
+
+BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
+		xen_evtchn_do_upcall)
+
+#endif /* CONFIG_XEN */
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
+	hyperv_vector_handler)
+
+#endif /* CONFIG_HYPERV */
+
+#ifdef CONFIG_FUNCTION_TRACER
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(mcount)
+	ret
+END(mcount)
+
+ENTRY(ftrace_caller)
+	pushl	%eax
+	pushl	%ecx
+	pushl	%edx
+	pushl	$0				/* Pass NULL as regs pointer */
+	movl	4*4(%esp), %eax
+	movl	0x4(%ebp), %edx
+	movl	function_trace_op, %ecx
+	subl	$MCOUNT_INSN_SIZE, %eax
+
+.globl ftrace_call
+ftrace_call:
+	call	ftrace_stub
+
+	addl	$4, %esp			/* skip NULL pointer */
+	popl	%edx
+	popl	%ecx
+	popl	%eax
+ftrace_ret:
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	jmp	ftrace_stub
+#endif
+
+.globl ftrace_stub
+ftrace_stub:
+	ret
+END(ftrace_caller)
+
+ENTRY(ftrace_regs_caller)
+	pushf	/* push flags before compare (in cs location) */
+
+	/*
+	 * i386 does not save SS and ESP when coming from kernel.
+	 * Instead, to get sp, &regs->sp is used (see ptrace.h).
+	 * Unfortunately, that means eflags must be at the same location
+	 * as the current return ip is. We move the return ip into the
+	 * ip location, and move flags into the return ip location.
+	 */
+	pushl	4(%esp)				/* save return ip into ip slot */
+
+	pushl	$0				/* Load 0 into orig_ax */
+	pushl	%gs
+	pushl	%fs
+	pushl	%es
+	pushl	%ds
+	pushl	%eax
+	pushl	%ebp
+	pushl	%edi
+	pushl	%esi
+	pushl	%edx
+	pushl	%ecx
+	pushl	%ebx
+
+	movl	13*4(%esp), %eax		/* Get the saved flags */
+	movl	%eax, 14*4(%esp)		/* Move saved flags into regs->flags location */
+						/* clobbering return ip */
+	movl	$__KERNEL_CS, 13*4(%esp)
+
+	movl	12*4(%esp), %eax		/* Load ip (1st parameter) */
+	subl	$MCOUNT_INSN_SIZE, %eax		/* Adjust ip */
+	movl	0x4(%ebp), %edx			/* Load parent ip (2nd parameter) */
+	movl	function_trace_op, %ecx		/* Save ftrace_pos in 3rd parameter */
+	pushl	%esp				/* Save pt_regs as 4th parameter */
+
+GLOBAL(ftrace_regs_call)
+	call	ftrace_stub
+
+	addl	$4, %esp			/* Skip pt_regs */
+	movl	14*4(%esp), %eax		/* Move flags back into cs */
+	movl	%eax, 13*4(%esp)		/* Needed to keep addl	from modifying flags */
+	movl	12*4(%esp), %eax		/* Get return ip from regs->ip */
+	movl	%eax, 14*4(%esp)		/* Put return ip back for ret */
+
+	popl	%ebx
+	popl	%ecx
+	popl	%edx
+	popl	%esi
+	popl	%edi
+	popl	%ebp
+	popl	%eax
+	popl	%ds
+	popl	%es
+	popl	%fs
+	popl	%gs
+	addl	$8, %esp			/* Skip orig_ax and ip */
+	popf					/* Pop flags at end (no addl to corrupt flags) */
+	jmp	ftrace_ret
+
+	popf
+	jmp	ftrace_stub
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(mcount)
+	cmpl	$__PAGE_OFFSET, %esp
+	jb	ftrace_stub			/* Paging not enabled yet? */
+
+	cmpl	$ftrace_stub, ftrace_trace_function
+	jnz	trace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	cmpl	$ftrace_stub, ftrace_graph_return
+	jnz	ftrace_graph_caller
+
+	cmpl	$ftrace_graph_entry_stub, ftrace_graph_entry
+	jnz	ftrace_graph_caller
+#endif
+.globl ftrace_stub
+ftrace_stub:
+	ret
+
+	/* taken from glibc */
+trace:
+	pushl	%eax
+	pushl	%ecx
+	pushl	%edx
+	movl	0xc(%esp), %eax
+	movl	0x4(%ebp), %edx
+	subl	$MCOUNT_INSN_SIZE, %eax
+
+	call	*ftrace_trace_function
+
+	popl	%edx
+	popl	%ecx
+	popl	%eax
+	jmp	ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+	pushl	%eax
+	pushl	%ecx
+	pushl	%edx
+	movl	0xc(%esp), %eax
+	lea	0x4(%ebp), %edx
+	movl	(%ebp), %ecx
+	subl	$MCOUNT_INSN_SIZE, %eax
+	call	prepare_ftrace_return
+	popl	%edx
+	popl	%ecx
+	popl	%eax
+	ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+	pushl	%eax
+	pushl	%edx
+	movl	%ebp, %eax
+	call	ftrace_return_to_handler
+	movl	%eax, %ecx
+	popl	%edx
+	popl	%eax
+	jmp	*%ecx
+#endif
+
+#ifdef CONFIG_TRACING
+ENTRY(trace_page_fault)
+	ASM_CLAC
+	pushl	$trace_do_page_fault
+	jmp	error_code
+END(trace_page_fault)
+#endif
+
+ENTRY(page_fault)
+	ASM_CLAC
+	pushl	$do_page_fault
+	ALIGN
+error_code:
+	/* the function address is in %gs's slot on the stack */
+	pushl	%fs
+	pushl	%es
+	pushl	%ds
+	pushl	%eax
+	pushl	%ebp
+	pushl	%edi
+	pushl	%esi
+	pushl	%edx
+	pushl	%ecx
+	pushl	%ebx
+	cld
+	movl	$(__KERNEL_PERCPU), %ecx
+	movl	%ecx, %fs
+	UNWIND_ESPFIX_STACK
+	GS_TO_REG %ecx
+	movl	PT_GS(%esp), %edi		# get the function address
+	movl	PT_ORIG_EAX(%esp), %edx		# get the error code
+	movl	$-1, PT_ORIG_EAX(%esp)		# no syscall to restart
+	REG_TO_PTGS %ecx
+	SET_KERNEL_GS %ecx
+	movl	$(__USER_DS), %ecx
+	movl	%ecx, %ds
+	movl	%ecx, %es
+	TRACE_IRQS_OFF
+	movl	%esp, %eax			# pt_regs pointer
+	call	*%edi
+	jmp	ret_from_exception
+END(page_fault)
+
+/*
+ * Debug traps and NMI can happen at the one SYSENTER instruction
+ * that sets up the real kernel stack. Check here, since we can't
+ * allow the wrong stack to be used.
+ *
+ * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
+ * already pushed 3 words if it hits on the sysenter instruction:
+ * eflags, cs and eip.
+ *
+ * We just load the right stack, and push the three (known) values
+ * by hand onto the new stack - while updating the return eip past
+ * the instruction that would have done it for sysenter.
+ */
+.macro FIX_STACK offset ok label
+	cmpw	$__KERNEL_CS, 4(%esp)
+	jne	\ok
+\label:
+	movl	TSS_sysenter_sp0 + \offset(%esp), %esp
+	pushfl
+	pushl	$__KERNEL_CS
+	pushl	$sysenter_past_esp
+.endm
+
+ENTRY(debug)
+	ASM_CLAC
+	cmpl	$entry_SYSENTER_32, (%esp)
+	jne	debug_stack_correct
+	FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
+debug_stack_correct:
+	pushl	$-1				# mark this as an int
+	SAVE_ALL
+	TRACE_IRQS_OFF
+	xorl	%edx, %edx			# error code 0
+	movl	%esp, %eax			# pt_regs pointer
+	call	do_debug
+	jmp	ret_from_exception
+END(debug)
+
+/*
+ * NMI is doubly nasty. It can happen _while_ we're handling
+ * a debug fault, and the debug fault hasn't yet been able to
+ * clear up the stack. So we first check whether we got  an
+ * NMI on the sysenter entry path, but after that we need to
+ * check whether we got an NMI on the debug path where the debug
+ * fault happened on the sysenter path.
+ */
+ENTRY(nmi)
+	ASM_CLAC
+#ifdef CONFIG_X86_ESPFIX32
+	pushl	%eax
+	movl	%ss, %eax
+	cmpw	$__ESPFIX_SS, %ax
+	popl	%eax
+	je	nmi_espfix_stack
+#endif
+	cmpl	$entry_SYSENTER_32, (%esp)
+	je	nmi_stack_fixup
+	pushl	%eax
+	movl	%esp, %eax
+	/*
+	 * Do not access memory above the end of our stack page,
+	 * it might not exist.
+	 */
+	andl	$(THREAD_SIZE-1), %eax
+	cmpl	$(THREAD_SIZE-20), %eax
+	popl	%eax
+	jae	nmi_stack_correct
+	cmpl	$entry_SYSENTER_32, 12(%esp)
+	je	nmi_debug_stack_check
+nmi_stack_correct:
+	pushl	%eax
+	SAVE_ALL
+	xorl	%edx, %edx			# zero error code
+	movl	%esp, %eax			# pt_regs pointer
+	call	do_nmi
+	jmp	restore_all_notrace
+
+nmi_stack_fixup:
+	FIX_STACK 12, nmi_stack_correct, 1
+	jmp	nmi_stack_correct
+
+nmi_debug_stack_check:
+	cmpw	$__KERNEL_CS, 16(%esp)
+	jne	nmi_stack_correct
+	cmpl	$debug, (%esp)
+	jb	nmi_stack_correct
+	cmpl	$debug_esp_fix_insn, (%esp)
+	ja	nmi_stack_correct
+	FIX_STACK 24, nmi_stack_correct, 1
+	jmp	nmi_stack_correct
+
+#ifdef CONFIG_X86_ESPFIX32
+nmi_espfix_stack:
+	/*
+	 * create the pointer to lss back
+	 */
+	pushl	%ss
+	pushl	%esp
+	addl	$4, (%esp)
+	/* copy the iret frame of 12 bytes */
+	.rept 3
+	pushl	16(%esp)
+	.endr
+	pushl	%eax
+	SAVE_ALL
+	FIXUP_ESPFIX_STACK			# %eax == %esp
+	xorl	%edx, %edx			# zero error code
+	call	do_nmi
+	RESTORE_REGS
+	lss	12+4(%esp), %esp		# back to espfix stack
+	jmp	irq_return
+#endif
+END(nmi)
+
+ENTRY(int3)
+	ASM_CLAC
+	pushl	$-1				# mark this as an int
+	SAVE_ALL
+	TRACE_IRQS_OFF
+	xorl	%edx, %edx			# zero error code
+	movl	%esp, %eax			# pt_regs pointer
+	call	do_int3
+	jmp	ret_from_exception
+END(int3)
+
+ENTRY(general_protection)
+	pushl	$do_general_protection
+	jmp	error_code
+END(general_protection)
+
+#ifdef CONFIG_KVM_GUEST
+ENTRY(async_page_fault)
+	ASM_CLAC
+	pushl	$do_async_page_fault
+	jmp	error_code
+END(async_page_fault)
+#endif
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/entry/entry_64.S
index 02c2eff7478d..3bb2c4302df1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -4,34 +4,25 @@
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
  *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
- */
-
-/*
+ *
  * entry.S contains the system-call and fault low-level handling routines.
  *
  * Some of this is documented in Documentation/x86/entry_64.txt
  *
- * NOTE: This code handles signal-recognition, which happens every time
- * after an interrupt and after each system call.
- *
  * A note on terminology:
- * - iret frame: Architecture defined interrupt frame from SS to RIP
- * at the top of the kernel process stack.
+ * - iret frame:	Architecture defined interrupt frame from SS to RIP
+ *			at the top of the kernel process stack.
  *
  * Some macro usage:
- * - CFI macros are used to generate dwarf2 unwind information for better
- * backtraces. They don't change any code.
- * - ENTRY/END Define functions in the symbol table.
- * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
- * - idtentry - Define exception entry points.
+ * - ENTRY/END:		Define functions in the symbol table.
+ * - TRACE_IRQ_*:	Trace hardirq state for lock debugging.
+ * - idtentry:		Define exception entry points.
  */
-
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/cache.h>
 #include <asm/errno.h>
-#include <asm/dwarf2.h>
-#include <asm/calling.h>
+#include "calling.h"
 #include <asm/asm-offsets.h>
 #include <asm/msr.h>
 #include <asm/unistd.h>
@@ -49,13 +40,12 @@
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
 #include <linux/elf-em.h>
-#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_64BIT 0x80000000
-#define __AUDIT_ARCH_LE	   0x40000000
-
-	.code64
-	.section .entry.text, "ax"
+#define AUDIT_ARCH_X86_64			(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_64BIT			0x80000000
+#define __AUDIT_ARCH_LE				0x40000000
 
+.code64
+.section .entry.text, "ax"
 
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_usergs_sysret64)
@@ -64,11 +54,10 @@ ENTRY(native_usergs_sysret64)
 ENDPROC(native_usergs_sysret64)
 #endif /* CONFIG_PARAVIRT */
 
-
 .macro TRACE_IRQS_IRETQ
 #ifdef CONFIG_TRACE_IRQFLAGS
-	bt   $9,EFLAGS(%rsp)	/* interrupts off? */
-	jnc  1f
+	bt	$9, EFLAGS(%rsp)		/* interrupts off? */
+	jnc	1f
 	TRACE_IRQS_ON
 1:
 #endif
@@ -88,89 +77,34 @@ ENDPROC(native_usergs_sysret64)
 #if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
 
 .macro TRACE_IRQS_OFF_DEBUG
-	call debug_stack_set_zero
+	call	debug_stack_set_zero
 	TRACE_IRQS_OFF
-	call debug_stack_reset
+	call	debug_stack_reset
 .endm
 
 .macro TRACE_IRQS_ON_DEBUG
-	call debug_stack_set_zero
+	call	debug_stack_set_zero
 	TRACE_IRQS_ON
-	call debug_stack_reset
+	call	debug_stack_reset
 .endm
 
 .macro TRACE_IRQS_IRETQ_DEBUG
-	bt   $9,EFLAGS(%rsp)	/* interrupts off? */
-	jnc  1f
+	bt	$9, EFLAGS(%rsp)		/* interrupts off? */
+	jnc	1f
 	TRACE_IRQS_ON_DEBUG
 1:
 .endm
 
 #else
-# define TRACE_IRQS_OFF_DEBUG		TRACE_IRQS_OFF
-# define TRACE_IRQS_ON_DEBUG		TRACE_IRQS_ON
-# define TRACE_IRQS_IRETQ_DEBUG		TRACE_IRQS_IRETQ
+# define TRACE_IRQS_OFF_DEBUG			TRACE_IRQS_OFF
+# define TRACE_IRQS_ON_DEBUG			TRACE_IRQS_ON
+# define TRACE_IRQS_IRETQ_DEBUG			TRACE_IRQS_IRETQ
 #endif
 
 /*
- * empty frame
- */
-	.macro EMPTY_FRAME start=1 offset=0
-	.if \start
-	CFI_STARTPROC simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA rsp,8+\offset
-	.else
-	CFI_DEF_CFA_OFFSET 8+\offset
-	.endif
-	.endm
-
-/*
- * initial frame state for interrupts (and exceptions without error code)
- */
-	.macro INTR_FRAME start=1 offset=0
-	EMPTY_FRAME \start, 5*8+\offset
-	/*CFI_REL_OFFSET ss, 4*8+\offset*/
-	CFI_REL_OFFSET rsp, 3*8+\offset
-	/*CFI_REL_OFFSET rflags, 2*8+\offset*/
-	/*CFI_REL_OFFSET cs, 1*8+\offset*/
-	CFI_REL_OFFSET rip, 0*8+\offset
-	.endm
-
-/*
- * initial frame state for exceptions with error code (and interrupts
- * with vector already pushed)
- */
-	.macro XCPT_FRAME start=1 offset=0
-	INTR_FRAME \start, 1*8+\offset
-	.endm
-
-/*
- * frame that enables passing a complete pt_regs to a C function.
- */
-	.macro DEFAULT_FRAME start=1 offset=0
-	XCPT_FRAME \start, ORIG_RAX+\offset
-	CFI_REL_OFFSET rdi, RDI+\offset
-	CFI_REL_OFFSET rsi, RSI+\offset
-	CFI_REL_OFFSET rdx, RDX+\offset
-	CFI_REL_OFFSET rcx, RCX+\offset
-	CFI_REL_OFFSET rax, RAX+\offset
-	CFI_REL_OFFSET r8, R8+\offset
-	CFI_REL_OFFSET r9, R9+\offset
-	CFI_REL_OFFSET r10, R10+\offset
-	CFI_REL_OFFSET r11, R11+\offset
-	CFI_REL_OFFSET rbx, RBX+\offset
-	CFI_REL_OFFSET rbp, RBP+\offset
-	CFI_REL_OFFSET r12, R12+\offset
-	CFI_REL_OFFSET r13, R13+\offset
-	CFI_REL_OFFSET r14, R14+\offset
-	CFI_REL_OFFSET r15, R15+\offset
-	.endm
-
-/*
- * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
+ * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
  *
- * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
  * then loads new ss, cs, and rip from previously programmed MSRs.
  * rflags gets masked by a value from another MSR (so CLD and CLAC
  * are not needed). SYSCALL does not save anything on the stack
@@ -186,7 +120,7 @@ ENDPROC(native_usergs_sysret64)
  * r10  arg3 (needs to be moved to rcx to conform to C ABI)
  * r8   arg4
  * r9   arg5
- * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
+ * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
  *
  * Only called from user space.
  *
@@ -195,13 +129,7 @@ ENDPROC(native_usergs_sysret64)
  * with them due to bugs in both AMD and Intel CPUs.
  */
 
-ENTRY(system_call)
-	CFI_STARTPROC	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,0
-	CFI_REGISTER	rip,rcx
-	/*CFI_REGISTER	rflags,r11*/
-
+ENTRY(entry_SYSCALL_64)
 	/*
 	 * Interrupts are off on entry.
 	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
@@ -213,14 +141,14 @@ ENTRY(system_call)
 	 * after the swapgs, so that it can do the swapgs
 	 * for the guest and jump here on syscall.
 	 */
-GLOBAL(system_call_after_swapgs)
+GLOBAL(entry_SYSCALL_64_after_swapgs)
 
-	movq	%rsp,PER_CPU_VAR(rsp_scratch)
-	movq	PER_CPU_VAR(kernel_stack),%rsp
+	movq	%rsp, PER_CPU_VAR(rsp_scratch)
+	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/* Construct struct pt_regs on stack */
-	pushq_cfi $__USER_DS			/* pt_regs->ss */
-	pushq_cfi PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
+	pushq	$__USER_DS			/* pt_regs->ss */
+	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
 	/*
 	 * Re-enable interrupts.
 	 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
@@ -229,36 +157,34 @@ GLOBAL(system_call_after_swapgs)
 	 * with using rsp_scratch:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq_cfi	%r11			/* pt_regs->flags */
-	pushq_cfi	$__USER_CS		/* pt_regs->cs */
-	pushq_cfi	%rcx			/* pt_regs->ip */
-	CFI_REL_OFFSET rip,0
-	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
-	pushq_cfi_reg	rdi			/* pt_regs->di */
-	pushq_cfi_reg	rsi			/* pt_regs->si */
-	pushq_cfi_reg	rdx			/* pt_regs->dx */
-	pushq_cfi_reg	rcx			/* pt_regs->cx */
-	pushq_cfi	$-ENOSYS		/* pt_regs->ax */
-	pushq_cfi_reg	r8			/* pt_regs->r8 */
-	pushq_cfi_reg	r9			/* pt_regs->r9 */
-	pushq_cfi_reg	r10			/* pt_regs->r10 */
-	pushq_cfi_reg	r11			/* pt_regs->r11 */
-	sub	$(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
-	CFI_ADJUST_CFA_OFFSET 6*8
-
-	testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz tracesys
-system_call_fastpath:
+	pushq	%r11				/* pt_regs->flags */
+	pushq	$__USER_CS			/* pt_regs->cs */
+	pushq	%rcx				/* pt_regs->ip */
+	pushq	%rax				/* pt_regs->orig_ax */
+	pushq	%rdi				/* pt_regs->di */
+	pushq	%rsi				/* pt_regs->si */
+	pushq	%rdx				/* pt_regs->dx */
+	pushq	%rcx				/* pt_regs->cx */
+	pushq	$-ENOSYS			/* pt_regs->ax */
+	pushq	%r8				/* pt_regs->r8 */
+	pushq	%r9				/* pt_regs->r9 */
+	pushq	%r10				/* pt_regs->r10 */
+	pushq	%r11				/* pt_regs->r11 */
+	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */
+
+	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	tracesys
+entry_SYSCALL_64_fastpath:
 #if __SYSCALL_MASK == ~0
-	cmpq $__NR_syscall_max,%rax
+	cmpq	$__NR_syscall_max, %rax
 #else
-	andl $__SYSCALL_MASK,%eax
-	cmpl $__NR_syscall_max,%eax
+	andl	$__SYSCALL_MASK, %eax
+	cmpl	$__NR_syscall_max, %eax
 #endif
-	ja	1f	/* return -ENOSYS (already in pt_regs->ax) */
-	movq %r10,%rcx
-	call *sys_call_table(,%rax,8)
-	movq %rax,RAX(%rsp)
+	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
+	movq	%r10, %rcx
+	call	*sys_call_table(, %rax, 8)
+	movq	%rax, RAX(%rsp)
 1:
 /*
  * Syscall return path ending with SYSRET (fast path).
@@ -279,19 +205,15 @@ system_call_fastpath:
 	 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
 	 * very bad.
 	 */
-	testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz int_ret_from_sys_call_irqs_off	/* Go to the slow path */
-
-	CFI_REMEMBER_STATE
+	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	int_ret_from_sys_call_irqs_off	/* Go to the slow path */
 
 	RESTORE_C_REGS_EXCEPT_RCX_R11
-	movq	RIP(%rsp),%rcx
-	CFI_REGISTER	rip,rcx
-	movq	EFLAGS(%rsp),%r11
-	/*CFI_REGISTER	rflags,r11*/
-	movq	RSP(%rsp),%rsp
+	movq	RIP(%rsp), %rcx
+	movq	EFLAGS(%rsp), %r11
+	movq	RSP(%rsp), %rsp
 	/*
-	 * 64bit SYSRET restores rip from rcx,
+	 * 64-bit SYSRET restores rip from rcx,
 	 * rflags from r11 (but RF and VM bits are forced to 0),
 	 * cs and ss are loaded from MSRs.
 	 * Restoration of rflags re-enables interrupts.
@@ -307,25 +229,23 @@ system_call_fastpath:
 	 */
 	USERGS_SYSRET64
 
-	CFI_RESTORE_STATE
-
 	/* Do syscall entry tracing */
 tracesys:
-	movq %rsp, %rdi
-	movl $AUDIT_ARCH_X86_64, %esi
-	call syscall_trace_enter_phase1
-	test %rax, %rax
-	jnz tracesys_phase2		/* if needed, run the slow path */
-	RESTORE_C_REGS_EXCEPT_RAX	/* else restore clobbered regs */
-	movq ORIG_RAX(%rsp), %rax
-	jmp system_call_fastpath	/*      and return to the fast path */
+	movq	%rsp, %rdi
+	movl	$AUDIT_ARCH_X86_64, %esi
+	call	syscall_trace_enter_phase1
+	test	%rax, %rax
+	jnz	tracesys_phase2			/* if needed, run the slow path */
+	RESTORE_C_REGS_EXCEPT_RAX		/* else restore clobbered regs */
+	movq	ORIG_RAX(%rsp), %rax
+	jmp	entry_SYSCALL_64_fastpath	/* and return to the fast path */
 
 tracesys_phase2:
 	SAVE_EXTRA_REGS
-	movq %rsp, %rdi
-	movl $AUDIT_ARCH_X86_64, %esi
-	movq %rax,%rdx
-	call syscall_trace_enter_phase2
+	movq	%rsp, %rdi
+	movl	$AUDIT_ARCH_X86_64, %esi
+	movq	%rax, %rdx
+	call	syscall_trace_enter_phase2
 
 	/*
 	 * Reload registers from stack in case ptrace changed them.
@@ -335,15 +255,15 @@ tracesys_phase2:
 	RESTORE_C_REGS_EXCEPT_RAX
 	RESTORE_EXTRA_REGS
 #if __SYSCALL_MASK == ~0
-	cmpq $__NR_syscall_max,%rax
+	cmpq	$__NR_syscall_max, %rax
 #else
-	andl $__SYSCALL_MASK,%eax
-	cmpl $__NR_syscall_max,%eax
+	andl	$__SYSCALL_MASK, %eax
+	cmpl	$__NR_syscall_max, %eax
 #endif
-	ja	1f	/* return -ENOSYS (already in pt_regs->ax) */
-	movq %r10,%rcx	/* fixup for C */
-	call *sys_call_table(,%rax,8)
-	movq %rax,RAX(%rsp)
+	ja	1f				/* return -ENOSYS (already in pt_regs->ax) */
+	movq	%r10, %rcx			/* fixup for C */
+	call	*sys_call_table(, %rax, 8)
+	movq	%rax, RAX(%rsp)
 1:
 	/* Use IRET because user could have changed pt_regs->foo */
 
@@ -355,31 +275,33 @@ GLOBAL(int_ret_from_sys_call)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
 	TRACE_IRQS_OFF
-	movl $_TIF_ALLWORK_MASK,%edi
+	movl	$_TIF_ALLWORK_MASK, %edi
 	/* edi:	mask to check */
 GLOBAL(int_with_check)
 	LOCKDEP_SYS_EXIT_IRQ
 	GET_THREAD_INFO(%rcx)
-	movl TI_flags(%rcx),%edx
-	andl %edi,%edx
-	jnz   int_careful
-	andl	$~TS_COMPAT,TI_status(%rcx)
+	movl	TI_flags(%rcx), %edx
+	andl	%edi, %edx
+	jnz	int_careful
+	andl	$~TS_COMPAT, TI_status(%rcx)
 	jmp	syscall_return
 
-	/* Either reschedule or signal or syscall exit tracking needed. */
-	/* First do a reschedule test. */
-	/* edx:	work, edi: workmask */
+	/*
+	 * Either reschedule or signal or syscall exit tracking needed.
+	 * First do a reschedule test.
+	 * edx:	work, edi: workmask
+	 */
 int_careful:
-	bt $TIF_NEED_RESCHED,%edx
-	jnc  int_very_careful
+	bt	$TIF_NEED_RESCHED, %edx
+	jnc	int_very_careful
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq_cfi %rdi
+	pushq	%rdi
 	SCHEDULE_USER
-	popq_cfi %rdi
+	popq	%rdi
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	jmp int_with_check
+	jmp	int_with_check
 
 	/* handle signals and tracing -- both require a full pt_regs */
 int_very_careful:
@@ -387,27 +309,27 @@ int_very_careful:
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_EXTRA_REGS
 	/* Check for syscall exit trace */
-	testl $_TIF_WORK_SYSCALL_EXIT,%edx
-	jz int_signal
-	pushq_cfi %rdi
-	leaq 8(%rsp),%rdi	# &ptregs -> arg1
-	call syscall_trace_leave
-	popq_cfi %rdi
-	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
-	jmp int_restore_rest
+	testl	$_TIF_WORK_SYSCALL_EXIT, %edx
+	jz	int_signal
+	pushq	%rdi
+	leaq	8(%rsp), %rdi			/* &ptregs -> arg1 */
+	call	syscall_trace_leave
+	popq	%rdi
+	andl	$~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi
+	jmp	int_restore_rest
 
 int_signal:
-	testl $_TIF_DO_NOTIFY_MASK,%edx
-	jz 1f
-	movq %rsp,%rdi		# &ptregs -> arg1
-	xorl %esi,%esi		# oldset -> arg2
-	call do_notify_resume
-1:	movl $_TIF_WORK_MASK,%edi
+	testl	$_TIF_DO_NOTIFY_MASK, %edx
+	jz	1f
+	movq	%rsp, %rdi			/* &ptregs -> arg1 */
+	xorl	%esi, %esi			/* oldset -> arg2 */
+	call	do_notify_resume
+1:	movl	$_TIF_WORK_MASK, %edi
 int_restore_rest:
 	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	jmp int_with_check
+	jmp	int_with_check
 
 syscall_return:
 	/* The IRETQ could re-enable interrupts: */
@@ -418,34 +340,37 @@ syscall_return:
 	 * Try to use SYSRET instead of IRET if we're returning to
 	 * a completely clean 64-bit userspace context.
 	 */
-	movq RCX(%rsp),%rcx
-	cmpq %rcx,RIP(%rsp)		/* RCX == RIP */
-	jne opportunistic_sysret_failed
+	movq	RCX(%rsp), %rcx
+	movq	RIP(%rsp), %r11
+	cmpq	%rcx, %r11			/* RCX == RIP */
+	jne	opportunistic_sysret_failed
 
 	/*
 	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
 	 * in kernel space.  This essentially lets the user take over
-	 * the kernel, since userspace controls RSP.  It's not worth
-	 * testing for canonicalness exactly -- this check detects any
-	 * of the 17 high bits set, which is true for non-canonical
-	 * or kernel addresses.  (This will pessimize vsyscall=native.
-	 * Big deal.)
+	 * the kernel, since userspace controls RSP.
 	 *
-	 * If virtual addresses ever become wider, this will need
+	 * If width of "canonical tail" ever becomes variable, this will need
 	 * to be updated to remain correct on both old and new CPUs.
 	 */
 	.ifne __VIRTUAL_MASK_SHIFT - 47
 	.error "virtual address width changed -- SYSRET checks need update"
 	.endif
-	shr $__VIRTUAL_MASK_SHIFT, %rcx
-	jnz opportunistic_sysret_failed
 
-	cmpq $__USER_CS,CS(%rsp)	/* CS must match SYSRET */
-	jne opportunistic_sysret_failed
+	/* Change top 16 bits to be the sign-extension of 47th bit */
+	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
 
-	movq R11(%rsp),%r11
-	cmpq %r11,EFLAGS(%rsp)		/* R11 == RFLAGS */
-	jne opportunistic_sysret_failed
+	/* If this changed %rcx, it was not canonical */
+	cmpq	%rcx, %r11
+	jne	opportunistic_sysret_failed
+
+	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
+	jne	opportunistic_sysret_failed
+
+	movq	R11(%rsp), %r11
+	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
+	jne	opportunistic_sysret_failed
 
 	/*
 	 * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
@@ -454,47 +379,41 @@ syscall_return:
 	 * with register state that satisfies the opportunistic SYSRET
 	 * conditions.  For example, single-stepping this user code:
 	 *
-	 *           movq $stuck_here,%rcx
+	 *           movq	$stuck_here, %rcx
 	 *           pushfq
 	 *           popq %r11
 	 *   stuck_here:
 	 *
 	 * would never get past 'stuck_here'.
 	 */
-	testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
-	jnz opportunistic_sysret_failed
+	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+	jnz	opportunistic_sysret_failed
 
 	/* nothing to check for RSP */
 
-	cmpq $__USER_DS,SS(%rsp)	/* SS must match SYSRET */
-	jne opportunistic_sysret_failed
+	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
+	jne	opportunistic_sysret_failed
 
 	/*
-	 * We win!  This label is here just for ease of understanding
-	 * perf profiles.  Nothing jumps here.
+	 * We win! This label is here just for ease of understanding
+	 * perf profiles. Nothing jumps here.
 	 */
 syscall_return_via_sysret:
-	CFI_REMEMBER_STATE
-	/* r11 is already restored (see code above) */
-	RESTORE_C_REGS_EXCEPT_R11
-	movq RSP(%rsp),%rsp
+	/* rcx and r11 are already restored (see code above) */
+	RESTORE_C_REGS_EXCEPT_RCX_R11
+	movq	RSP(%rsp), %rsp
 	USERGS_SYSRET64
-	CFI_RESTORE_STATE
 
 opportunistic_sysret_failed:
 	SWAPGS
 	jmp	restore_c_regs_and_iret
-	CFI_ENDPROC
-END(system_call)
+END(entry_SYSCALL_64)
 
 
 	.macro FORK_LIKE func
 ENTRY(stub_\func)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8		/* offset 8: return address */
 	SAVE_EXTRA_REGS 8
-	jmp sys_\func
-	CFI_ENDPROC
+	jmp	sys_\func
 END(stub_\func)
 	.endm
 
@@ -503,8 +422,6 @@ END(stub_\func)
 	FORK_LIKE  vfork
 
 ENTRY(stub_execve)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8
 	call	sys_execve
 return_from_execve:
 	testl	%eax, %eax
@@ -514,11 +431,9 @@ return_from_execve:
 1:
 	/* must use IRET code path (pt_regs->cs may have changed) */
 	addq	$8, %rsp
-	CFI_ADJUST_CFA_OFFSET -8
 	ZERO_EXTRA_REGS
-	movq	%rax,RAX(%rsp)
+	movq	%rax, RAX(%rsp)
 	jmp	int_ret_from_sys_call
-	CFI_ENDPROC
 END(stub_execve)
 /*
  * Remaining execve stubs are only 7 bytes long.
@@ -526,47 +441,25 @@ END(stub_execve)
  */
 	.align	8
 GLOBAL(stub_execveat)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8
 	call	sys_execveat
 	jmp	return_from_execve
-	CFI_ENDPROC
 END(stub_execveat)
 
-#ifdef CONFIG_X86_X32_ABI
+#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
 	.align	8
 GLOBAL(stub_x32_execve)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8
-	call	compat_sys_execve
-	jmp	return_from_execve
-	CFI_ENDPROC
-END(stub_x32_execve)
-	.align	8
-GLOBAL(stub_x32_execveat)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8
-	call	compat_sys_execveat
-	jmp	return_from_execve
-	CFI_ENDPROC
-END(stub_x32_execveat)
-#endif
-
-#ifdef CONFIG_IA32_EMULATION
-	.align	8
 GLOBAL(stub32_execve)
-	CFI_STARTPROC
 	call	compat_sys_execve
 	jmp	return_from_execve
-	CFI_ENDPROC
 END(stub32_execve)
+END(stub_x32_execve)
 	.align	8
+GLOBAL(stub_x32_execveat)
 GLOBAL(stub32_execveat)
-	CFI_STARTPROC
 	call	compat_sys_execveat
 	jmp	return_from_execve
-	CFI_ENDPROC
 END(stub32_execveat)
+END(stub_x32_execveat)
 #endif
 
 /*
@@ -574,8 +467,6 @@ END(stub32_execveat)
  * This cannot be done with SYSRET, so use the IRET return path instead.
  */
 ENTRY(stub_rt_sigreturn)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8
 	/*
 	 * SAVE_EXTRA_REGS result is not normally needed:
 	 * sigreturn overwrites all pt_regs->GPREGS.
@@ -584,24 +475,19 @@ ENTRY(stub_rt_sigreturn)
 	 * we SAVE_EXTRA_REGS here.
 	 */
 	SAVE_EXTRA_REGS 8
-	call sys_rt_sigreturn
+	call	sys_rt_sigreturn
 return_from_stub:
 	addq	$8, %rsp
-	CFI_ADJUST_CFA_OFFSET -8
 	RESTORE_EXTRA_REGS
-	movq %rax,RAX(%rsp)
-	jmp int_ret_from_sys_call
-	CFI_ENDPROC
+	movq	%rax, RAX(%rsp)
+	jmp	int_ret_from_sys_call
 END(stub_rt_sigreturn)
 
 #ifdef CONFIG_X86_X32_ABI
 ENTRY(stub_x32_rt_sigreturn)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8
 	SAVE_EXTRA_REGS 8
-	call sys32_x32_rt_sigreturn
-	jmp  return_from_stub
-	CFI_ENDPROC
+	call	sys32_x32_rt_sigreturn
+	jmp	return_from_stub
 END(stub_x32_rt_sigreturn)
 #endif
 
@@ -611,36 +497,36 @@ END(stub_x32_rt_sigreturn)
  * rdi: prev task we switched from
  */
 ENTRY(ret_from_fork)
-	DEFAULT_FRAME
 
-	LOCK ; btr $TIF_FORK,TI_flags(%r8)
+	LOCK ; btr $TIF_FORK, TI_flags(%r8)
 
-	pushq_cfi $0x0002
-	popfq_cfi				# reset kernel eflags
+	pushq	$0x0002
+	popfq					/* reset kernel eflags */
 
-	call schedule_tail			# rdi: 'prev' task parameter
+	call	schedule_tail			/* rdi: 'prev' task parameter */
 
 	RESTORE_EXTRA_REGS
 
-	testl $3,CS(%rsp)			# from kernel_thread?
+	testb	$3, CS(%rsp)			/* from kernel_thread? */
 
 	/*
 	 * By the time we get here, we have no idea whether our pt_regs,
 	 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
-	 * the slow path, or one of the ia32entry paths.
+	 * the slow path, or one of the 32-bit compat paths.
 	 * Use IRET code path to return, since it can safely handle
 	 * all of the above.
 	 */
 	jnz	int_ret_from_sys_call
 
-	/* We came from kernel_thread */
-	/* nb: we depend on RESTORE_EXTRA_REGS above */
-	movq %rbp, %rdi
-	call *%rbx
-	movl $0, RAX(%rsp)
+	/*
+	 * We came from kernel_thread
+	 * nb: we depend on RESTORE_EXTRA_REGS above
+	 */
+	movq	%rbp, %rdi
+	call	*%rbx
+	movl	$0, RAX(%rsp)
 	RESTORE_EXTRA_REGS
-	jmp int_ret_from_sys_call
-	CFI_ENDPROC
+	jmp	int_ret_from_sys_call
 END(ret_from_fork)
 
 /*
@@ -649,16 +535,13 @@ END(ret_from_fork)
  */
 	.align 8
 ENTRY(irq_entries_start)
-	INTR_FRAME
     vector=FIRST_EXTERNAL_VECTOR
     .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
-	pushq_cfi $(~vector+0x80)	/* Note: always in signed byte range */
+	pushq	$(~vector+0x80)			/* Note: always in signed byte range */
     vector=vector+1
 	jmp	common_interrupt
-	CFI_ADJUST_CFA_OFFSET -8
 	.align	8
     .endr
-	CFI_ENDPROC
 END(irq_entries_start)
 
 /*
@@ -684,10 +567,10 @@ END(irq_entries_start)
 	/* this goes to 0(%rsp) for unwinder, not for saving the value: */
 	SAVE_EXTRA_REGS_RBP -RBP
 
-	leaq -RBP(%rsp),%rdi	/* arg1 for \func (pointer to pt_regs) */
+	leaq	-RBP(%rsp), %rdi		/* arg1 for \func (pointer to pt_regs) */
 
-	testl $3, CS-RBP(%rsp)
-	je 1f
+	testb	$3, CS-RBP(%rsp)
+	jz	1f
 	SWAPGS
 1:
 	/*
@@ -697,24 +580,14 @@ END(irq_entries_start)
 	 * a little cheaper to use a separate counter in the PDA (short of
 	 * moving irq_enter into assembly, which would be too much work)
 	 */
-	movq %rsp, %rsi
-	incl PER_CPU_VAR(irq_count)
-	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
-	CFI_DEF_CFA_REGISTER	rsi
-	pushq %rsi
-	/*
-	 * For debugger:
-	 * "CFA (Current Frame Address) is the value on stack + offset"
-	 */
-	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \
-			0x77 /* DW_OP_breg7 (rsp) */, 0, \
-			0x06 /* DW_OP_deref */, \
-			0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
-			0x22 /* DW_OP_plus */
+	movq	%rsp, %rsi
+	incl	PER_CPU_VAR(irq_count)
+	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
+	pushq	%rsi
 	/* We entered an interrupt context - irqs are off: */
 	TRACE_IRQS_OFF
 
-	call \func
+	call	\func
 	.endm
 
 	/*
@@ -723,42 +596,36 @@ END(irq_entries_start)
 	 */
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 common_interrupt:
-	XCPT_FRAME
 	ASM_CLAC
-	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */
+	addq	$-0x80, (%rsp)			/* Adjust vector to [-256, -1] range */
 	interrupt do_IRQ
 	/* 0(%rsp): old RSP */
 ret_from_intr:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	decl PER_CPU_VAR(irq_count)
+	decl	PER_CPU_VAR(irq_count)
 
 	/* Restore saved previous stack */
-	popq %rsi
-	CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
+	popq	%rsi
 	/* return code expects complete pt_regs - adjust rsp accordingly: */
-	leaq -RBP(%rsi),%rsp
-	CFI_DEF_CFA_REGISTER	rsp
-	CFI_ADJUST_CFA_OFFSET	RBP
+	leaq	-RBP(%rsi), %rsp
 
-	testl $3,CS(%rsp)
-	je retint_kernel
+	testb	$3, CS(%rsp)
+	jz	retint_kernel
 	/* Interrupt came from user space */
-
+retint_user:
 	GET_THREAD_INFO(%rcx)
-	/*
-	 * %rcx: thread info. Interrupts off.
-	 */
+
+	/* %rcx: thread info. Interrupts are off. */
 retint_with_reschedule:
-	movl $_TIF_WORK_MASK,%edi
+	movl	$_TIF_WORK_MASK, %edi
 retint_check:
 	LOCKDEP_SYS_EXIT_IRQ
-	movl TI_flags(%rcx),%edx
-	andl %edi,%edx
-	CFI_REMEMBER_STATE
-	jnz  retint_careful
+	movl	TI_flags(%rcx), %edx
+	andl	%edi, %edx
+	jnz	retint_careful
 
-retint_swapgs:		/* return to user-space */
+retint_swapgs:					/* return to user-space */
 	/*
 	 * The iretq could re-enable interrupts:
 	 */
@@ -773,9 +640,9 @@ retint_kernel:
 #ifdef CONFIG_PREEMPT
 	/* Interrupts are off */
 	/* Check if we need preemption */
-	bt	$9,EFLAGS(%rsp)	/* interrupts were off? */
+	bt	$9, EFLAGS(%rsp)		/* were interrupts off? */
 	jnc	1f
-0:	cmpl	$0,PER_CPU_VAR(__preempt_count)
+0:	cmpl	$0, PER_CPU_VAR(__preempt_count)
 	jnz	1f
 	call	preempt_schedule_irq
 	jmp	0b
@@ -793,8 +660,6 @@ retint_kernel:
 restore_c_regs_and_iret:
 	RESTORE_C_REGS
 	REMOVE_PT_GPREGS_FROM_STACK 8
-
-irq_return:
 	INTERRUPT_RETURN
 
 ENTRY(native_iret)
@@ -803,8 +668,8 @@ ENTRY(native_iret)
 	 * 64-bit mode SS:RSP on the exception stack is always valid.
 	 */
 #ifdef CONFIG_X86_ESPFIX64
-	testb $4,(SS-RIP)(%rsp)
-	jnz native_irq_return_ldt
+	testb	$4, (SS-RIP)(%rsp)
+	jnz	native_irq_return_ldt
 #endif
 
 .global native_irq_return_iret
@@ -819,62 +684,60 @@ native_irq_return_iret:
 
 #ifdef CONFIG_X86_ESPFIX64
 native_irq_return_ldt:
-	pushq_cfi %rax
-	pushq_cfi %rdi
+	pushq	%rax
+	pushq	%rdi
 	SWAPGS
-	movq PER_CPU_VAR(espfix_waddr),%rdi
-	movq %rax,(0*8)(%rdi)	/* RAX */
-	movq (2*8)(%rsp),%rax	/* RIP */
-	movq %rax,(1*8)(%rdi)
-	movq (3*8)(%rsp),%rax	/* CS */
-	movq %rax,(2*8)(%rdi)
-	movq (4*8)(%rsp),%rax	/* RFLAGS */
-	movq %rax,(3*8)(%rdi)
-	movq (6*8)(%rsp),%rax	/* SS */
-	movq %rax,(5*8)(%rdi)
-	movq (5*8)(%rsp),%rax	/* RSP */
-	movq %rax,(4*8)(%rdi)
-	andl $0xffff0000,%eax
-	popq_cfi %rdi
-	orq PER_CPU_VAR(espfix_stack),%rax
+	movq	PER_CPU_VAR(espfix_waddr), %rdi
+	movq	%rax, (0*8)(%rdi)		/* RAX */
+	movq	(2*8)(%rsp), %rax		/* RIP */
+	movq	%rax, (1*8)(%rdi)
+	movq	(3*8)(%rsp), %rax		/* CS */
+	movq	%rax, (2*8)(%rdi)
+	movq	(4*8)(%rsp), %rax		/* RFLAGS */
+	movq	%rax, (3*8)(%rdi)
+	movq	(6*8)(%rsp), %rax		/* SS */
+	movq	%rax, (5*8)(%rdi)
+	movq	(5*8)(%rsp), %rax		/* RSP */
+	movq	%rax, (4*8)(%rdi)
+	andl	$0xffff0000, %eax
+	popq	%rdi
+	orq	PER_CPU_VAR(espfix_stack), %rax
 	SWAPGS
-	movq %rax,%rsp
-	popq_cfi %rax
-	jmp native_irq_return_iret
+	movq	%rax, %rsp
+	popq	%rax
+	jmp	native_irq_return_iret
 #endif
 
 	/* edi: workmask, edx: work */
 retint_careful:
-	CFI_RESTORE_STATE
-	bt    $TIF_NEED_RESCHED,%edx
-	jnc   retint_signal
+	bt	$TIF_NEED_RESCHED, %edx
+	jnc	retint_signal
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq_cfi %rdi
+	pushq	%rdi
 	SCHEDULE_USER
-	popq_cfi %rdi
+	popq	%rdi
 	GET_THREAD_INFO(%rcx)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	jmp retint_check
+	jmp	retint_check
 
 retint_signal:
-	testl $_TIF_DO_NOTIFY_MASK,%edx
-	jz    retint_swapgs
+	testl	$_TIF_DO_NOTIFY_MASK, %edx
+	jz	retint_swapgs
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_EXTRA_REGS
-	movq $-1,ORIG_RAX(%rsp)
-	xorl %esi,%esi		# oldset
-	movq %rsp,%rdi		# &pt_regs
-	call do_notify_resume
+	movq	$-1, ORIG_RAX(%rsp)
+	xorl	%esi, %esi			/* oldset */
+	movq	%rsp, %rdi			/* &pt_regs */
+	call	do_notify_resume
 	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)
-	jmp retint_with_reschedule
+	jmp	retint_with_reschedule
 
-	CFI_ENDPROC
 END(common_interrupt)
 
 /*
@@ -882,13 +745,11 @@ END(common_interrupt)
  */
 .macro apicinterrupt3 num sym do_sym
 ENTRY(\sym)
-	INTR_FRAME
 	ASM_CLAC
-	pushq_cfi $~(\num)
+	pushq	$~(\num)
 .Lcommon_\sym:
 	interrupt \do_sym
-	jmp ret_from_intr
-	CFI_ENDPROC
+	jmp	ret_from_intr
 END(\sym)
 .endm
 
@@ -910,53 +771,45 @@ trace_apicinterrupt \num \sym
 .endm
 
 #ifdef CONFIG_SMP
-apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \
-	irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
-apicinterrupt3 REBOOT_VECTOR \
-	reboot_interrupt smp_reboot_interrupt
+apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR		irq_move_cleanup_interrupt	smp_irq_move_cleanup_interrupt
+apicinterrupt3 REBOOT_VECTOR			reboot_interrupt		smp_reboot_interrupt
 #endif
 
 #ifdef CONFIG_X86_UV
-apicinterrupt3 UV_BAU_MESSAGE \
-	uv_bau_message_intr1 uv_bau_message_interrupt
+apicinterrupt3 UV_BAU_MESSAGE			uv_bau_message_intr1		uv_bau_message_interrupt
 #endif
-apicinterrupt LOCAL_TIMER_VECTOR \
-	apic_timer_interrupt smp_apic_timer_interrupt
-apicinterrupt X86_PLATFORM_IPI_VECTOR \
-	x86_platform_ipi smp_x86_platform_ipi
+
+apicinterrupt LOCAL_TIMER_VECTOR		apic_timer_interrupt		smp_apic_timer_interrupt
+apicinterrupt X86_PLATFORM_IPI_VECTOR		x86_platform_ipi		smp_x86_platform_ipi
 
 #ifdef CONFIG_HAVE_KVM
-apicinterrupt3 POSTED_INTR_VECTOR \
-	kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
+apicinterrupt3 POSTED_INTR_VECTOR		kvm_posted_intr_ipi		smp_kvm_posted_intr_ipi
+apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR	kvm_posted_intr_wakeup_ipi	smp_kvm_posted_intr_wakeup_ipi
 #endif
 
 #ifdef CONFIG_X86_MCE_THRESHOLD
-apicinterrupt THRESHOLD_APIC_VECTOR \
-	threshold_interrupt smp_threshold_interrupt
+apicinterrupt THRESHOLD_APIC_VECTOR		threshold_interrupt		smp_threshold_interrupt
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+apicinterrupt DEFERRED_ERROR_VECTOR		deferred_error_interrupt	smp_deferred_error_interrupt
 #endif
 
 #ifdef CONFIG_X86_THERMAL_VECTOR
-apicinterrupt THERMAL_APIC_VECTOR \
-	thermal_interrupt smp_thermal_interrupt
+apicinterrupt THERMAL_APIC_VECTOR		thermal_interrupt		smp_thermal_interrupt
 #endif
 
 #ifdef CONFIG_SMP
-apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
-	call_function_single_interrupt smp_call_function_single_interrupt
-apicinterrupt CALL_FUNCTION_VECTOR \
-	call_function_interrupt smp_call_function_interrupt
-apicinterrupt RESCHEDULE_VECTOR \
-	reschedule_interrupt smp_reschedule_interrupt
+apicinterrupt CALL_FUNCTION_SINGLE_VECTOR	call_function_single_interrupt	smp_call_function_single_interrupt
+apicinterrupt CALL_FUNCTION_VECTOR		call_function_interrupt		smp_call_function_interrupt
+apicinterrupt RESCHEDULE_VECTOR			reschedule_interrupt		smp_reschedule_interrupt
 #endif
 
-apicinterrupt ERROR_APIC_VECTOR \
-	error_interrupt smp_error_interrupt
-apicinterrupt SPURIOUS_APIC_VECTOR \
-	spurious_interrupt smp_spurious_interrupt
+apicinterrupt ERROR_APIC_VECTOR			error_interrupt			smp_error_interrupt
+apicinterrupt SPURIOUS_APIC_VECTOR		spurious_interrupt		smp_spurious_interrupt
 
 #ifdef CONFIG_IRQ_WORK
-apicinterrupt IRQ_WORK_VECTOR \
-	irq_work_interrupt smp_irq_work_interrupt
+apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
 #endif
 
 /*
@@ -971,100 +824,87 @@ ENTRY(\sym)
 	.error "using shift_ist requires paranoid=1"
 	.endif
 
-	.if \has_error_code
-	XCPT_FRAME
-	.else
-	INTR_FRAME
-	.endif
-
 	ASM_CLAC
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
 
 	.ifeq \has_error_code
-	pushq_cfi $-1			/* ORIG_RAX: no syscall to restart */
+	pushq	$-1				/* ORIG_RAX: no syscall to restart */
 	.endif
 
 	ALLOC_PT_GPREGS_ON_STACK
 
 	.if \paranoid
 	.if \paranoid == 1
-	CFI_REMEMBER_STATE
-	testl $3, CS(%rsp)		/* If coming from userspace, switch */
-	jnz 1f				/* stacks. */
+	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */
+	jnz	1f
 	.endif
-	call paranoid_entry
+	call	paranoid_entry
 	.else
-	call error_entry
+	call	error_entry
 	.endif
 	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
 
-	DEFAULT_FRAME 0
-
 	.if \paranoid
 	.if \shift_ist != -1
-	TRACE_IRQS_OFF_DEBUG		/* reload IDT in case of recursion */
+	TRACE_IRQS_OFF_DEBUG			/* reload IDT in case of recursion */
 	.else
 	TRACE_IRQS_OFF
 	.endif
 	.endif
 
-	movq %rsp,%rdi			/* pt_regs pointer */
+	movq	%rsp, %rdi			/* pt_regs pointer */
 
 	.if \has_error_code
-	movq ORIG_RAX(%rsp),%rsi	/* get error code */
-	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
+	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
+	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
 	.else
-	xorl %esi,%esi			/* no error code */
+	xorl	%esi, %esi			/* no error code */
 	.endif
 
 	.if \shift_ist != -1
-	subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
+	subq	$EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
 	.endif
 
-	call \do_sym
+	call	\do_sym
 
 	.if \shift_ist != -1
-	addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
+	addq	$EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
 	.endif
 
 	/* these procedures expect "no swapgs" flag in ebx */
 	.if \paranoid
-	jmp paranoid_exit
+	jmp	paranoid_exit
 	.else
-	jmp error_exit
+	jmp	error_exit
 	.endif
 
 	.if \paranoid == 1
-	CFI_RESTORE_STATE
 	/*
 	 * Paranoid entry from userspace.  Switch stacks and treat it
 	 * as a normal entry.  This means that paranoid handlers
 	 * run in real process context if user_mode(regs).
 	 */
 1:
-	call error_entry
+	call	error_entry
 
-	DEFAULT_FRAME 0
 
-	movq %rsp,%rdi			/* pt_regs pointer */
-	call sync_regs
-	movq %rax,%rsp			/* switch stack */
+	movq	%rsp, %rdi			/* pt_regs pointer */
+	call	sync_regs
+	movq	%rax, %rsp			/* switch stack */
 
-	movq %rsp,%rdi			/* pt_regs pointer */
+	movq	%rsp, %rdi			/* pt_regs pointer */
 
 	.if \has_error_code
-	movq ORIG_RAX(%rsp),%rsi	/* get error code */
-	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
+	movq	ORIG_RAX(%rsp), %rsi		/* get error code */
+	movq	$-1, ORIG_RAX(%rsp)		/* no syscall to restart */
 	.else
-	xorl %esi,%esi			/* no error code */
+	xorl	%esi, %esi			/* no error code */
 	.endif
 
-	call \do_sym
+	call	\do_sym
 
-	jmp error_exit			/* %ebx: no swapgs flag */
+	jmp	error_exit			/* %ebx: no swapgs flag */
 	.endif
-
-	CFI_ENDPROC
 END(\sym)
 .endm
 
@@ -1079,65 +919,58 @@ idtentry \sym \do_sym has_error_code=\has_error_code
 .endm
 #endif
 
-idtentry divide_error do_divide_error has_error_code=0
-idtentry overflow do_overflow has_error_code=0
-idtentry bounds do_bounds has_error_code=0
-idtentry invalid_op do_invalid_op has_error_code=0
-idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=2
-idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
-idtentry invalid_TSS do_invalid_TSS has_error_code=1
-idtentry segment_not_present do_segment_not_present has_error_code=1
-idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
-idtentry coprocessor_error do_coprocessor_error has_error_code=0
-idtentry alignment_check do_alignment_check has_error_code=1
-idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
-
-
-	/* Reload gs selector with exception handling */
-	/* edi:  new selector */
+idtentry divide_error			do_divide_error			has_error_code=0
+idtentry overflow			do_overflow			has_error_code=0
+idtentry bounds				do_bounds			has_error_code=0
+idtentry invalid_op			do_invalid_op			has_error_code=0
+idtentry device_not_available		do_device_not_available		has_error_code=0
+idtentry double_fault			do_double_fault			has_error_code=1 paranoid=2
+idtentry coprocessor_segment_overrun	do_coprocessor_segment_overrun	has_error_code=0
+idtentry invalid_TSS			do_invalid_TSS			has_error_code=1
+idtentry segment_not_present		do_segment_not_present		has_error_code=1
+idtentry spurious_interrupt_bug		do_spurious_interrupt_bug	has_error_code=0
+idtentry coprocessor_error		do_coprocessor_error		has_error_code=0
+idtentry alignment_check		do_alignment_check		has_error_code=1
+idtentry simd_coprocessor_error		do_simd_coprocessor_error	has_error_code=0
+
+
+	/*
+	 * Reload gs selector with exception handling
+	 * edi:  new selector
+	 */
 ENTRY(native_load_gs_index)
-	CFI_STARTPROC
-	pushfq_cfi
+	pushfq
 	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
 	SWAPGS
 gs_change:
-	movl %edi,%gs
-2:	mfence		/* workaround */
+	movl	%edi, %gs
+2:	mfence					/* workaround */
 	SWAPGS
-	popfq_cfi
+	popfq
 	ret
-	CFI_ENDPROC
 END(native_load_gs_index)
 
-	_ASM_EXTABLE(gs_change,bad_gs)
-	.section .fixup,"ax"
+	_ASM_EXTABLE(gs_change, bad_gs)
+	.section .fixup, "ax"
 	/* running with kernelgs */
 bad_gs:
-	SWAPGS			/* switch back to user gs */
-	xorl %eax,%eax
-	movl %eax,%gs
-	jmp  2b
+	SWAPGS					/* switch back to user gs */
+	xorl	%eax, %eax
+	movl	%eax, %gs
+	jmp	2b
 	.previous
 
 /* Call softirq on interrupt stack. Interrupts are off. */
 ENTRY(do_softirq_own_stack)
-	CFI_STARTPROC
-	pushq_cfi %rbp
-	CFI_REL_OFFSET rbp,0
-	mov  %rsp,%rbp
-	CFI_DEF_CFA_REGISTER rbp
-	incl PER_CPU_VAR(irq_count)
-	cmove PER_CPU_VAR(irq_stack_ptr),%rsp
-	push  %rbp			# backlink for old unwinder
-	call __do_softirq
+	pushq	%rbp
+	mov	%rsp, %rbp
+	incl	PER_CPU_VAR(irq_count)
+	cmove	PER_CPU_VAR(irq_stack_ptr), %rsp
+	push	%rbp				/* frame pointer backlink */
+	call	__do_softirq
 	leaveq
-	CFI_RESTORE		rbp
-	CFI_DEF_CFA_REGISTER	rsp
-	CFI_ADJUST_CFA_OFFSET   -8
-	decl PER_CPU_VAR(irq_count)
+	decl	PER_CPU_VAR(irq_count)
 	ret
-	CFI_ENDPROC
 END(do_softirq_own_stack)
 
 #ifdef CONFIG_XEN
@@ -1156,29 +989,24 @@ idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
  * existing activation in its critical region -- if so, we pop the current
  * activation and restart the handler using the previous one.
  */
-ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
-	CFI_STARTPROC
+ENTRY(xen_do_hypervisor_callback)		/* do_hypervisor_callback(struct *pt_regs) */
+
 /*
  * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
  * see the correct pointer to the pt_regs
  */
-	movq %rdi, %rsp            # we don't return, adjust the stack frame
-	CFI_ENDPROC
-	DEFAULT_FRAME
-11:	incl PER_CPU_VAR(irq_count)
-	movq %rsp,%rbp
-	CFI_DEF_CFA_REGISTER rbp
-	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
-	pushq %rbp			# backlink for old unwinder
-	call xen_evtchn_do_upcall
-	popq %rsp
-	CFI_DEF_CFA_REGISTER rsp
-	decl PER_CPU_VAR(irq_count)
+	movq	%rdi, %rsp			/* we don't return, adjust the stack frame */
+11:	incl	PER_CPU_VAR(irq_count)
+	movq	%rsp, %rbp
+	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
+	pushq	%rbp				/* frame pointer backlink */
+	call	xen_evtchn_do_upcall
+	popq	%rsp
+	decl	PER_CPU_VAR(irq_count)
 #ifndef CONFIG_PREEMPT
-	call xen_maybe_preempt_hcall
+	call	xen_maybe_preempt_hcall
 #endif
-	jmp  error_exit
-	CFI_ENDPROC
+	jmp	error_exit
 END(xen_do_hypervisor_callback)
 
 /*
@@ -1195,51 +1023,35 @@ END(xen_do_hypervisor_callback)
  * with its current contents: any discrepancy means we in category 1.
  */
 ENTRY(xen_failsafe_callback)
-	INTR_FRAME 1 (6*8)
-	/*CFI_REL_OFFSET gs,GS*/
-	/*CFI_REL_OFFSET fs,FS*/
-	/*CFI_REL_OFFSET es,ES*/
-	/*CFI_REL_OFFSET ds,DS*/
-	CFI_REL_OFFSET r11,8
-	CFI_REL_OFFSET rcx,0
-	movw %ds,%cx
-	cmpw %cx,0x10(%rsp)
-	CFI_REMEMBER_STATE
-	jne 1f
-	movw %es,%cx
-	cmpw %cx,0x18(%rsp)
-	jne 1f
-	movw %fs,%cx
-	cmpw %cx,0x20(%rsp)
-	jne 1f
-	movw %gs,%cx
-	cmpw %cx,0x28(%rsp)
-	jne 1f
+	movl	%ds, %ecx
+	cmpw	%cx, 0x10(%rsp)
+	jne	1f
+	movl	%es, %ecx
+	cmpw	%cx, 0x18(%rsp)
+	jne	1f
+	movl	%fs, %ecx
+	cmpw	%cx, 0x20(%rsp)
+	jne	1f
+	movl	%gs, %ecx
+	cmpw	%cx, 0x28(%rsp)
+	jne	1f
 	/* All segments match their saved values => Category 2 (Bad IRET). */
-	movq (%rsp),%rcx
-	CFI_RESTORE rcx
-	movq 8(%rsp),%r11
-	CFI_RESTORE r11
-	addq $0x30,%rsp
-	CFI_ADJUST_CFA_OFFSET -0x30
-	pushq_cfi $0	/* RIP */
-	pushq_cfi %r11
-	pushq_cfi %rcx
-	jmp general_protection
-	CFI_RESTORE_STATE
+	movq	(%rsp), %rcx
+	movq	8(%rsp), %r11
+	addq	$0x30, %rsp
+	pushq	$0				/* RIP */
+	pushq	%r11
+	pushq	%rcx
+	jmp	general_protection
 1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
-	movq (%rsp),%rcx
-	CFI_RESTORE rcx
-	movq 8(%rsp),%r11
-	CFI_RESTORE r11
-	addq $0x30,%rsp
-	CFI_ADJUST_CFA_OFFSET -0x30
-	pushq_cfi $-1 /* orig_ax = -1 => not a system call */
+	movq	(%rsp), %rcx
+	movq	8(%rsp), %r11
+	addq	$0x30, %rsp
+	pushq	$-1 /* orig_ax = -1 => not a system call */
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS
 	SAVE_EXTRA_REGS
-	jmp error_exit
-	CFI_ENDPROC
+	jmp	error_exit
 END(xen_failsafe_callback)
 
 apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
@@ -1252,21 +1064,25 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
 	hyperv_callback_vector hyperv_vector_handler
 #endif /* CONFIG_HYPERV */
 
-idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
-idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
-idtentry stack_segment do_stack_segment has_error_code=1
+idtentry debug			do_debug		has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
+idtentry int3			do_int3			has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
+idtentry stack_segment		do_stack_segment	has_error_code=1
+
 #ifdef CONFIG_XEN
-idtentry xen_debug do_debug has_error_code=0
-idtentry xen_int3 do_int3 has_error_code=0
-idtentry xen_stack_segment do_stack_segment has_error_code=1
+idtentry xen_debug		do_debug		has_error_code=0
+idtentry xen_int3		do_int3			has_error_code=0
+idtentry xen_stack_segment	do_stack_segment	has_error_code=1
 #endif
-idtentry general_protection do_general_protection has_error_code=1
-trace_idtentry page_fault do_page_fault has_error_code=1
+
+idtentry general_protection	do_general_protection	has_error_code=1
+trace_idtentry page_fault	do_page_fault		has_error_code=1
+
 #ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault do_async_page_fault has_error_code=1
+idtentry async_page_fault	do_async_page_fault	has_error_code=1
 #endif
+
 #ifdef CONFIG_X86_MCE
-idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
+idtentry machine_check					has_error_code=0	paranoid=1 do_sym=*machine_check_vector(%rip)
 #endif
 
 /*
@@ -1275,19 +1091,17 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(
  * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(paranoid_entry)
-	XCPT_FRAME 1 15*8
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
-	movl $1,%ebx
-	movl $MSR_GS_BASE,%ecx
+	movl	$1, %ebx
+	movl	$MSR_GS_BASE, %ecx
 	rdmsr
-	testl %edx,%edx
-	js 1f	/* negative -> in kernel */
+	testl	%edx, %edx
+	js	1f				/* negative -> in kernel */
 	SWAPGS
-	xorl %ebx,%ebx
+	xorl	%ebx, %ebx
 1:	ret
-	CFI_ENDPROC
 END(paranoid_entry)
 
 /*
@@ -1299,17 +1113,17 @@ END(paranoid_entry)
  * in syscall entry), so checking for preemption here would
  * be complicated.  Fortunately, we there's no good reason
  * to try to handle preemption here.
+ *
+ * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
  */
-/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
 ENTRY(paranoid_exit)
-	DEFAULT_FRAME
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF_DEBUG
-	testl %ebx,%ebx				/* swapgs needed? */
-	jnz paranoid_exit_no_swapgs
+	testl	%ebx, %ebx			/* swapgs needed? */
+	jnz	paranoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
 	SWAPGS_UNSAFE_STACK
-	jmp paranoid_exit_restore
+	jmp	paranoid_exit_restore
 paranoid_exit_no_swapgs:
 	TRACE_IRQS_IRETQ_DEBUG
 paranoid_exit_restore:
@@ -1317,24 +1131,24 @@ paranoid_exit_restore:
 	RESTORE_C_REGS
 	REMOVE_PT_GPREGS_FROM_STACK 8
 	INTERRUPT_RETURN
-	CFI_ENDPROC
 END(paranoid_exit)
 
 /*
  * Save all registers in pt_regs, and switch gs if needed.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ * Return: EBX=0: came from user mode; EBX=1: otherwise
  */
 ENTRY(error_entry)
-	XCPT_FRAME 1 15*8
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
-	xorl %ebx,%ebx
-	testl $3,CS+8(%rsp)
-	je error_kernelspace
-error_swapgs:
+	xorl	%ebx, %ebx
+	testb	$3, CS+8(%rsp)
+	jz	error_kernelspace
+
+	/* We entered from user mode */
 	SWAPGS
-error_sti:
+
+error_entry_done:
 	TRACE_IRQS_OFF
 	ret
 
@@ -1345,56 +1159,66 @@ error_sti:
 	 * for these here too.
 	 */
 error_kernelspace:
-	CFI_REL_OFFSET rcx, RCX+8
-	incl %ebx
-	leaq native_irq_return_iret(%rip),%rcx
-	cmpq %rcx,RIP+8(%rsp)
-	je error_bad_iret
-	movl %ecx,%eax	/* zero extend */
-	cmpq %rax,RIP+8(%rsp)
-	je bstep_iret
-	cmpq $gs_change,RIP+8(%rsp)
-	je error_swapgs
-	jmp error_sti
+	incl	%ebx
+	leaq	native_irq_return_iret(%rip), %rcx
+	cmpq	%rcx, RIP+8(%rsp)
+	je	error_bad_iret
+	movl	%ecx, %eax			/* zero extend */
+	cmpq	%rax, RIP+8(%rsp)
+	je	bstep_iret
+	cmpq	$gs_change, RIP+8(%rsp)
+	jne	error_entry_done
+
+	/*
+	 * hack: gs_change can fail with user gsbase.  If this happens, fix up
+	 * gsbase and proceed.  We'll fix up the exception and land in
+	 * gs_change's error handler with kernel gsbase.
+	 */
+	SWAPGS
+	jmp	error_entry_done
 
 bstep_iret:
 	/* Fix truncated RIP */
-	movq %rcx,RIP+8(%rsp)
+	movq	%rcx, RIP+8(%rsp)
 	/* fall through */
 
 error_bad_iret:
+	/*
+	 * We came from an IRET to user mode, so we have user gsbase.
+	 * Switch to kernel gsbase:
+	 */
 	SWAPGS
-	mov %rsp,%rdi
-	call fixup_bad_iret
-	mov %rax,%rsp
-	decl %ebx	/* Return to usergs */
-	jmp error_sti
-	CFI_ENDPROC
+
+	/*
+	 * Pretend that the exception came from user mode: set up pt_regs
+	 * as if we faulted immediately after IRET and clear EBX so that
+	 * error_exit knows that we will be returning to user mode.
+	 */
+	mov	%rsp, %rdi
+	call	fixup_bad_iret
+	mov	%rax, %rsp
+	decl	%ebx
+	jmp	error_entry_done
 END(error_entry)
 
 
-/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
+/*
+ * On entry, EBS is a "return to kernel mode" flag:
+ *   1: already in kernel mode, don't need SWAPGS
+ *   0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
+ */
 ENTRY(error_exit)
-	DEFAULT_FRAME
-	movl %ebx,%eax
+	movl	%ebx, %eax
 	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	GET_THREAD_INFO(%rcx)
-	testl %eax,%eax
-	jne retint_kernel
-	LOCKDEP_SYS_EXIT_IRQ
-	movl TI_flags(%rcx),%edx
-	movl $_TIF_WORK_MASK,%edi
-	andl %edi,%edx
-	jnz retint_careful
-	jmp retint_swapgs
-	CFI_ENDPROC
+	testl	%eax, %eax
+	jnz	retint_kernel
+	jmp	retint_user
 END(error_exit)
 
 /* Runs on exception stack */
 ENTRY(nmi)
-	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	/*
 	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
@@ -1429,22 +1253,21 @@ ENTRY(nmi)
 	 */
 
 	/* Use %rdx as our temp variable throughout */
-	pushq_cfi %rdx
-	CFI_REL_OFFSET rdx, 0
+	pushq	%rdx
 
 	/*
 	 * If %cs was not the kernel segment, then the NMI triggered in user
 	 * space, which means it is definitely not nested.
 	 */
-	cmpl $__KERNEL_CS, 16(%rsp)
-	jne first_nmi
+	cmpl	$__KERNEL_CS, 16(%rsp)
+	jne	first_nmi
 
 	/*
 	 * Check the special variable on the stack to see if NMIs are
 	 * executing.
 	 */
-	cmpl $1, -8(%rsp)
-	je nested_nmi
+	cmpl	$1, -8(%rsp)
+	je	nested_nmi
 
 	/*
 	 * Now test if the previous stack was an NMI stack.
@@ -1458,51 +1281,46 @@ ENTRY(nmi)
 	cmpq	%rdx, 4*8(%rsp)
 	/* If the stack pointer is above the NMI stack, this is a normal NMI */
 	ja	first_nmi
+
 	subq	$EXCEPTION_STKSZ, %rdx
 	cmpq	%rdx, 4*8(%rsp)
 	/* If it is below the NMI stack, it is a normal NMI */
 	jb	first_nmi
 	/* Ah, it is within the NMI stack, treat it as nested */
 
-	CFI_REMEMBER_STATE
-
 nested_nmi:
 	/*
 	 * Do nothing if we interrupted the fixup in repeat_nmi.
 	 * It's about to repeat the NMI handler, so we are fine
 	 * with ignoring this one.
 	 */
-	movq $repeat_nmi, %rdx
-	cmpq 8(%rsp), %rdx
-	ja 1f
-	movq $end_repeat_nmi, %rdx
-	cmpq 8(%rsp), %rdx
-	ja nested_nmi_out
+	movq	$repeat_nmi, %rdx
+	cmpq	8(%rsp), %rdx
+	ja	1f
+	movq	$end_repeat_nmi, %rdx
+	cmpq	8(%rsp), %rdx
+	ja	nested_nmi_out
 
 1:
 	/* Set up the interrupted NMIs stack to jump to repeat_nmi */
-	leaq -1*8(%rsp), %rdx
-	movq %rdx, %rsp
-	CFI_ADJUST_CFA_OFFSET 1*8
-	leaq -10*8(%rsp), %rdx
-	pushq_cfi $__KERNEL_DS
-	pushq_cfi %rdx
-	pushfq_cfi
-	pushq_cfi $__KERNEL_CS
-	pushq_cfi $repeat_nmi
+	leaq	-1*8(%rsp), %rdx
+	movq	%rdx, %rsp
+	leaq	-10*8(%rsp), %rdx
+	pushq	$__KERNEL_DS
+	pushq	%rdx
+	pushfq
+	pushq	$__KERNEL_CS
+	pushq	$repeat_nmi
 
 	/* Put stack back */
-	addq $(6*8), %rsp
-	CFI_ADJUST_CFA_OFFSET -6*8
+	addq	$(6*8), %rsp
 
 nested_nmi_out:
-	popq_cfi %rdx
-	CFI_RESTORE rdx
+	popq	%rdx
 
 	/* No need to check faults here */
 	INTERRUPT_RETURN
 
-	CFI_RESTORE_STATE
 first_nmi:
 	/*
 	 * Because nested NMIs will use the pushed location that we
@@ -1540,23 +1358,18 @@ first_nmi:
 	 * is also used by nested NMIs and can not be trusted on exit.
 	 */
 	/* Do not pop rdx, nested NMIs will corrupt that part of the stack */
-	movq (%rsp), %rdx
-	CFI_RESTORE rdx
+	movq	(%rsp), %rdx
 
 	/* Set the NMI executing variable on the stack. */
-	pushq_cfi $1
+	pushq	$1
 
-	/*
-	 * Leave room for the "copied" frame
-	 */
-	subq $(5*8), %rsp
-	CFI_ADJUST_CFA_OFFSET 5*8
+	/* Leave room for the "copied" frame */
+	subq	$(5*8), %rsp
 
 	/* Copy the stack frame to the Saved frame */
 	.rept 5
-	pushq_cfi 11*8(%rsp)
+	pushq	11*8(%rsp)
 	.endr
-	CFI_DEF_CFA_OFFSET 5*8
 
 	/* Everything up to here is safe from nested NMIs */
 
@@ -1575,16 +1388,14 @@ repeat_nmi:
 	 * is benign for the non-repeat case, where 1 was pushed just above
 	 * to this very stack slot).
 	 */
-	movq $1, 10*8(%rsp)
+	movq	$1, 10*8(%rsp)
 
 	/* Make another copy, this one may be modified by nested NMIs */
-	addq $(10*8), %rsp
-	CFI_ADJUST_CFA_OFFSET -10*8
+	addq	$(10*8), %rsp
 	.rept 5
-	pushq_cfi -6*8(%rsp)
+	pushq	-6*8(%rsp)
 	.endr
-	subq $(5*8), %rsp
-	CFI_DEF_CFA_OFFSET 5*8
+	subq	$(5*8), %rsp
 end_repeat_nmi:
 
 	/*
@@ -1592,7 +1403,7 @@ end_repeat_nmi:
 	 * NMI if the first NMI took an exception and reset our iret stack
 	 * so that we repeat another NMI.
 	 */
-	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
+	pushq	$-1				/* ORIG_RAX: no syscall to restart */
 	ALLOC_PT_GPREGS_ON_STACK
 
 	/*
@@ -1602,8 +1413,7 @@ end_repeat_nmi:
 	 * setting NEED_RESCHED or anything that normal interrupts and
 	 * exceptions might do.
 	 */
-	call paranoid_entry
-	DEFAULT_FRAME 0
+	call	paranoid_entry
 
 	/*
 	 * Save off the CR2 register. If we take a page fault in the NMI then
@@ -1614,22 +1424,21 @@ end_repeat_nmi:
 	 * origin fault. Save it off and restore it if it changes.
 	 * Use the r12 callee-saved register.
 	 */
-	movq %cr2, %r12
+	movq	%cr2, %r12
 
 	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
-	movq %rsp,%rdi
-	movq $-1,%rsi
-	call do_nmi
+	movq	%rsp, %rdi
+	movq	$-1, %rsi
+	call	do_nmi
 
 	/* Did the NMI take a page fault? Restore cr2 if it did */
-	movq %cr2, %rcx
-	cmpq %rcx, %r12
-	je 1f
-	movq %r12, %cr2
+	movq	%cr2, %rcx
+	cmpq	%rcx, %r12
+	je	1f
+	movq	%r12, %cr2
 1:
-	
-	testl %ebx,%ebx				/* swapgs needed? */
-	jnz nmi_restore
+	testl	%ebx, %ebx			/* swapgs needed? */
+	jnz	nmi_restore
 nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
@@ -1639,15 +1448,11 @@ nmi_restore:
 	REMOVE_PT_GPREGS_FROM_STACK 6*8
 
 	/* Clear the NMI executing stack variable */
-	movq $0, 5*8(%rsp)
-	jmp irq_return
-	CFI_ENDPROC
+	movq	$0, 5*8(%rsp)
+	INTERRUPT_RETURN
 END(nmi)
 
 ENTRY(ignore_sysret)
-	CFI_STARTPROC
-	mov $-ENOSYS,%eax
+	mov	$-ENOSYS, %eax
 	sysret
-	CFI_ENDPROC
 END(ignore_sysret)
-
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
new file mode 100644
index 000000000000..bb187a6a877c
--- /dev/null
+++ b/arch/x86/entry/entry_64_compat.S
@@ -0,0 +1,556 @@
+/*
+ * Compatibility mode system call entry point for x86-64.
+ *
+ * Copyright 2000-2002 Andi Kleen, SuSE Labs.
+ */
+#include "calling.h"
+#include <asm/asm-offsets.h>
+#include <asm/current.h>
+#include <asm/errno.h>
+#include <asm/ia32_unistd.h>
+#include <asm/thread_info.h>
+#include <asm/segment.h>
+#include <asm/irqflags.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <linux/linkage.h>
+#include <linux/err.h>
+
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE		0x40000000
+
+#ifndef CONFIG_AUDITSYSCALL
+# define sysexit_audit		ia32_ret_from_sys_call
+# define sysretl_audit		ia32_ret_from_sys_call
+#endif
+
+	.section .entry.text, "ax"
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_usergs_sysret32)
+	swapgs
+	sysretl
+ENDPROC(native_usergs_sysret32)
+#endif
+
+/*
+ * 32-bit SYSENTER instruction entry.
+ *
+ * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
+ * IF and VM in rflags are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old rip (!!!) and rflags.
+ *
+ * Arguments:
+ * eax  system call number
+ * ebx  arg1
+ * ecx  arg2
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * ebp  user stack
+ * 0(%ebp) arg6
+ *
+ * This is purely a fast path. For anything complicated we use the int 0x80
+ * path below. We set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
+ENTRY(entry_SYSENTER_compat)
+	/*
+	 * Interrupts are off on entry.
+	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+	 * it is too small to ever cause noticeable irq latency.
+	 */
+	SWAPGS_UNSAFE_STACK
+	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
+	/* Zero-extending 32-bit regs, do not remove */
+	movl	%ebp, %ebp
+	movl	%eax, %eax
+
+	movl	ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d
+
+	/* Construct struct pt_regs on stack */
+	pushq	$__USER32_DS		/* pt_regs->ss */
+	pushq	%rbp			/* pt_regs->sp */
+	pushfq				/* pt_regs->flags */
+	pushq	$__USER32_CS		/* pt_regs->cs */
+	pushq	%r10			/* pt_regs->ip = thread_info->sysenter_return */
+	pushq	%rax			/* pt_regs->orig_ax */
+	pushq	%rdi			/* pt_regs->di */
+	pushq	%rsi			/* pt_regs->si */
+	pushq	%rdx			/* pt_regs->dx */
+	pushq	%rcx			/* pt_regs->cx */
+	pushq	$-ENOSYS		/* pt_regs->ax */
+	cld
+	sub	$(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */
+
+	/*
+	 * no need to do an access_ok check here because rbp has been
+	 * 32-bit zero extended
+	 */
+	ASM_STAC
+1:	movl	(%rbp), %ebp
+	_ASM_EXTABLE(1b, ia32_badarg)
+	ASM_CLAC
+
+	/*
+	 * Sysenter doesn't filter flags, so we need to clear NT
+	 * ourselves.  To save a few cycles, we can check whether
+	 * NT was set instead of doing an unconditional popfq.
+	 */
+	testl	$X86_EFLAGS_NT, EFLAGS(%rsp)
+	jnz	sysenter_fix_flags
+sysenter_flags_fixed:
+
+	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	sysenter_tracesys
+
+sysenter_do_call:
+	/* 32-bit syscall -> 64-bit C ABI argument conversion */
+	movl	%edi, %r8d		/* arg5 */
+	movl	%ebp, %r9d		/* arg6 */
+	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
+	movl	%ebx, %edi		/* arg1 */
+	movl	%edx, %edx		/* arg3 (zero extension) */
+sysenter_dispatch:
+	cmpq	$(IA32_NR_syscalls-1), %rax
+	ja	1f
+	call	*ia32_sys_call_table(, %rax, 8)
+	movq	%rax, RAX(%rsp)
+1:
+	DISABLE_INTERRUPTS(CLBR_NONE)
+	TRACE_IRQS_OFF
+	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	sysexit_audit
+sysexit_from_sys_call:
+	/*
+	 * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
+	 * NMI between STI and SYSEXIT has poorly specified behavior,
+	 * and and NMI followed by an IRQ with usergs is fatal.  So
+	 * we just pretend we're using SYSEXIT but we really use
+	 * SYSRETL instead.
+	 *
+	 * This code path is still called 'sysexit' because it pairs
+	 * with 'sysenter' and it uses the SYSENTER calling convention.
+	 */
+	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	movl	RIP(%rsp), %ecx		/* User %eip */
+	RESTORE_RSI_RDI
+	xorl	%edx, %edx		/* Do not leak kernel information */
+	xorq	%r8, %r8
+	xorq	%r9, %r9
+	xorq	%r10, %r10
+	movl	EFLAGS(%rsp), %r11d	/* User eflags */
+	TRACE_IRQS_ON
+
+	/*
+	 * SYSRETL works even on Intel CPUs.  Use it in preference to SYSEXIT,
+	 * since it avoids a dicey window with interrupts enabled.
+	 */
+	movl	RSP(%rsp), %esp
+
+	/*
+	 * USERGS_SYSRET32 does:
+	 *  gsbase = user's gs base
+	 *  eip = ecx
+	 *  rflags = r11
+	 *  cs = __USER32_CS
+	 *  ss = __USER_DS
+	 *
+	 * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
+	 *
+	 *  pop %ebp
+	 *  pop %edx
+	 *  pop %ecx
+	 *
+	 * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
+	 * avoid info leaks.  R11 ends up with VDSO32_SYSENTER_RETURN's
+	 * address (already known to user code), and R12-R15 are
+	 * callee-saved and therefore don't contain any interesting
+	 * kernel data.
+	 */
+	USERGS_SYSRET32
+
+#ifdef CONFIG_AUDITSYSCALL
+	.macro auditsys_entry_common
+	/*
+	 * At this point, registers hold syscall args in the 32-bit syscall ABI:
+	 * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP.
+	 *
+	 * We want to pass them to __audit_syscall_entry(), which is a 64-bit
+	 * C function with 5 parameters, so shuffle them to match what
+	 * the function expects: RDI,RSI,RDX,RCX,R8.
+	 */
+	movl	%esi, %r8d		/* arg5 (R8 ) <= 4th syscall arg (ESI) */
+	xchg	%ecx, %edx		/* arg4 (RCX) <= 3rd syscall arg (EDX) */
+					/* arg3 (RDX) <= 2nd syscall arg (ECX) */
+	movl	%ebx, %esi		/* arg2 (RSI) <= 1st syscall arg (EBX) */
+	movl	%eax, %edi		/* arg1 (RDI) <= syscall number  (EAX) */
+	call	__audit_syscall_entry
+
+	/*
+	 * We are going to jump back to the syscall dispatch code.
+	 * Prepare syscall args as required by the 64-bit C ABI.
+	 * Registers clobbered by __audit_syscall_entry() are
+	 * loaded from pt_regs on stack:
+	 */
+	movl	ORIG_RAX(%rsp), %eax	/* syscall number */
+	movl	%ebx, %edi		/* arg1 */
+	movl	RCX(%rsp), %esi		/* arg2 */
+	movl	RDX(%rsp), %edx		/* arg3 */
+	movl	RSI(%rsp), %ecx		/* arg4 */
+	movl	RDI(%rsp), %r8d		/* arg5 */
+	movl	%ebp, %r9d		/* arg6 */
+	.endm
+
+	.macro auditsys_exit exit
+	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	ia32_ret_from_sys_call
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_NONE)
+	movl	%eax, %esi		/* second arg, syscall return value */
+	cmpl	$-MAX_ERRNO, %eax	/* is it an error ? */
+	jbe	1f
+	movslq	%eax, %rsi		/* if error sign extend to 64 bits */
+1:	setbe	%al			/* 1 if error, 0 if not */
+	movzbl	%al, %edi		/* zero-extend that into %edi */
+	call	__audit_syscall_exit
+	movq	RAX(%rsp), %rax		/* reload syscall return value */
+	movl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi
+	DISABLE_INTERRUPTS(CLBR_NONE)
+	TRACE_IRQS_OFF
+	testl	%edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jz	\exit
+	xorl	%eax, %eax		/* Do not leak kernel information */
+	movq	%rax, R11(%rsp)
+	movq	%rax, R10(%rsp)
+	movq	%rax, R9(%rsp)
+	movq	%rax, R8(%rsp)
+	jmp	int_with_check
+	.endm
+
+sysenter_auditsys:
+	auditsys_entry_common
+	jmp	sysenter_dispatch
+
+sysexit_audit:
+	auditsys_exit sysexit_from_sys_call
+#endif
+
+sysenter_fix_flags:
+	pushq	$(X86_EFLAGS_IF|X86_EFLAGS_FIXED)
+	popfq
+	jmp	sysenter_flags_fixed
+
+sysenter_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jz	sysenter_auditsys
+#endif
+	SAVE_EXTRA_REGS
+	xorl	%eax, %eax		/* Do not leak kernel information */
+	movq	%rax, R11(%rsp)
+	movq	%rax, R10(%rsp)
+	movq	%rax, R9(%rsp)
+	movq	%rax, R8(%rsp)
+	movq	%rsp, %rdi		/* &pt_regs -> arg1 */
+	call	syscall_trace_enter
+
+	/* Reload arg registers from stack. (see sysenter_tracesys) */
+	movl	RCX(%rsp), %ecx
+	movl	RDX(%rsp), %edx
+	movl	RSI(%rsp), %esi
+	movl	RDI(%rsp), %edi
+	movl	%eax, %eax		/* zero extension */
+
+	RESTORE_EXTRA_REGS
+	jmp	sysenter_do_call
+ENDPROC(entry_SYSENTER_compat)
+
+/*
+ * 32-bit SYSCALL instruction entry.
+ *
+ * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
+ *
+ * Note: rflags saving+masking-with-MSR happens only in Long mode
+ * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
+ * Don't get confused: rflags saving+masking depends on Long Mode Active bit
+ * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
+ * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
+ *
+ * Arguments:
+ * eax  system call number
+ * ecx  return address
+ * ebx  arg1
+ * ebp  arg2	(note: not saved in the stack frame, should not be touched)
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * esp  user stack
+ * 0(%esp) arg6
+ *
+ * This is purely a fast path. For anything complicated we use the int 0x80
+ * path below. We set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
+ENTRY(entry_SYSCALL_compat)
+	/*
+	 * Interrupts are off on entry.
+	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+	 * it is too small to ever cause noticeable irq latency.
+	 */
+	SWAPGS_UNSAFE_STACK
+	movl	%esp, %r8d
+	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
+	/* Zero-extending 32-bit regs, do not remove */
+	movl	%eax, %eax
+
+	/* Construct struct pt_regs on stack */
+	pushq	$__USER32_DS		/* pt_regs->ss */
+	pushq	%r8			/* pt_regs->sp */
+	pushq	%r11			/* pt_regs->flags */
+	pushq	$__USER32_CS		/* pt_regs->cs */
+	pushq	%rcx			/* pt_regs->ip */
+	pushq	%rax			/* pt_regs->orig_ax */
+	pushq	%rdi			/* pt_regs->di */
+	pushq	%rsi			/* pt_regs->si */
+	pushq	%rdx			/* pt_regs->dx */
+	pushq	%rbp			/* pt_regs->cx */
+	movl	%ebp, %ecx
+	pushq	$-ENOSYS		/* pt_regs->ax */
+	sub	$(10*8), %rsp		/* pt_regs->r8-11, bp, bx, r12-15 not saved */
+
+	/*
+	 * No need to do an access_ok check here because r8 has been
+	 * 32-bit zero extended:
+	 */
+	ASM_STAC
+1:	movl	(%r8), %ebp
+	_ASM_EXTABLE(1b, ia32_badarg)
+	ASM_CLAC
+	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	cstar_tracesys
+
+cstar_do_call:
+	/* 32-bit syscall -> 64-bit C ABI argument conversion */
+	movl	%edi, %r8d		/* arg5 */
+	movl	%ebp, %r9d		/* arg6 */
+	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
+	movl	%ebx, %edi		/* arg1 */
+	movl	%edx, %edx		/* arg3 (zero extension) */
+
+cstar_dispatch:
+	cmpq	$(IA32_NR_syscalls-1), %rax
+	ja	1f
+
+	call	*ia32_sys_call_table(, %rax, 8)
+	movq	%rax, RAX(%rsp)
+1:
+	movl	RCX(%rsp), %ebp
+	DISABLE_INTERRUPTS(CLBR_NONE)
+	TRACE_IRQS_OFF
+	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	sysretl_audit
+
+sysretl_from_sys_call:
+	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	RESTORE_RSI_RDI_RDX
+	movl	RIP(%rsp), %ecx
+	movl	EFLAGS(%rsp), %r11d
+	xorq	%r10, %r10
+	xorq	%r9, %r9
+	xorq	%r8, %r8
+	TRACE_IRQS_ON
+	movl	RSP(%rsp), %esp
+	/*
+	 * 64-bit->32-bit SYSRET restores eip from ecx,
+	 * eflags from r11 (but RF and VM bits are forced to 0),
+	 * cs and ss are loaded from MSRs.
+	 * (Note: 32-bit->32-bit SYSRET is different: since r11
+	 * does not exist, it merely sets eflags.IF=1).
+	 *
+	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
+	 * descriptor is not reinitialized.  This means that we must
+	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
+	 * exit the kernel, and re-enter using an interrupt vector.  (All
+	 * interrupt entries on x86_64 set SS to NULL.)  We prevent that
+	 * from happening by reloading SS in __switch_to.
+	 */
+	USERGS_SYSRET32
+
+#ifdef CONFIG_AUDITSYSCALL
+cstar_auditsys:
+	auditsys_entry_common
+	jmp	cstar_dispatch
+
+sysretl_audit:
+	auditsys_exit sysretl_from_sys_call
+#endif
+
+cstar_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jz	cstar_auditsys
+#endif
+	SAVE_EXTRA_REGS
+	xorl	%eax, %eax		/* Do not leak kernel information */
+	movq	%rax, R11(%rsp)
+	movq	%rax, R10(%rsp)
+	movq	%rax, R9(%rsp)
+	movq	%rax, R8(%rsp)
+	movq	%rsp, %rdi		/* &pt_regs -> arg1 */
+	call	syscall_trace_enter
+
+	/* Reload arg registers from stack. (see sysenter_tracesys) */
+	movl	RCX(%rsp), %ecx
+	movl	RDX(%rsp), %edx
+	movl	RSI(%rsp), %esi
+	movl	RDI(%rsp), %edi
+	movl	%eax, %eax		/* zero extension */
+
+	RESTORE_EXTRA_REGS
+	jmp	cstar_do_call
+END(entry_SYSCALL_compat)
+
+ia32_badarg:
+	ASM_CLAC
+	movq	$-EFAULT, RAX(%rsp)
+ia32_ret_from_sys_call:
+	xorl	%eax, %eax		/* Do not leak kernel information */
+	movq	%rax, R11(%rsp)
+	movq	%rax, R10(%rsp)
+	movq	%rax, R9(%rsp)
+	movq	%rax, R8(%rsp)
+	jmp	int_ret_from_sys_call
+
+/*
+ * Emulated IA32 system calls via int 0x80.
+ *
+ * Arguments:
+ * eax  system call number
+ * ebx  arg1
+ * ecx  arg2
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * ebp  arg6	(note: not saved in the stack frame, should not be touched)
+ *
+ * Notes:
+ * Uses the same stack frame as the x86-64 version.
+ * All registers except eax must be saved (but ptrace may violate that).
+ * Arguments are zero extended. For system calls that want sign extension and
+ * take long arguments a wrapper is needed. Most calls can just be called
+ * directly.
+ * Assumes it is only called from user space and entered with interrupts off.
+ */
+
+ENTRY(entry_INT80_compat)
+	/*
+	 * Interrupts are off on entry.
+	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+	 * it is too small to ever cause noticeable irq latency.
+	 */
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
+	SWAPGS
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
+	/* Zero-extending 32-bit regs, do not remove */
+	movl	%eax, %eax
+
+	/* Construct struct pt_regs on stack (iret frame is already on stack) */
+	pushq	%rax			/* pt_regs->orig_ax */
+	pushq	%rdi			/* pt_regs->di */
+	pushq	%rsi			/* pt_regs->si */
+	pushq	%rdx			/* pt_regs->dx */
+	pushq	%rcx			/* pt_regs->cx */
+	pushq	$-ENOSYS		/* pt_regs->ax */
+	pushq	$0			/* pt_regs->r8 */
+	pushq	$0			/* pt_regs->r9 */
+	pushq	$0			/* pt_regs->r10 */
+	pushq	$0			/* pt_regs->r11 */
+	cld
+	sub	$(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
+
+	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	ia32_tracesys
+
+ia32_do_call:
+	/* 32-bit syscall -> 64-bit C ABI argument conversion */
+	movl	%edi, %r8d		/* arg5 */
+	movl	%ebp, %r9d		/* arg6 */
+	xchg	%ecx, %esi		/* rsi:arg2, rcx:arg4 */
+	movl	%ebx, %edi		/* arg1 */
+	movl	%edx, %edx		/* arg3 (zero extension) */
+	cmpq	$(IA32_NR_syscalls-1), %rax
+	ja	1f
+
+	call	*ia32_sys_call_table(, %rax, 8)
+	movq	%rax, RAX(%rsp)
+1:
+	jmp	int_ret_from_sys_call
+
+ia32_tracesys:
+	SAVE_EXTRA_REGS
+	movq	%rsp, %rdi			/* &pt_regs -> arg1 */
+	call	syscall_trace_enter
+	/*
+	 * Reload arg registers from stack in case ptrace changed them.
+	 * Don't reload %eax because syscall_trace_enter() returned
+	 * the %rax value we should see.  But do truncate it to 32 bits.
+	 * If it's -1 to make us punt the syscall, then (u32)-1 is still
+	 * an appropriately invalid value.
+	 */
+	movl	RCX(%rsp), %ecx
+	movl	RDX(%rsp), %edx
+	movl	RSI(%rsp), %esi
+	movl	RDI(%rsp), %edi
+	movl	%eax, %eax		/* zero extension */
+	RESTORE_EXTRA_REGS
+	jmp	ia32_do_call
+END(entry_INT80_compat)
+
+	.macro PTREGSCALL label, func
+	ALIGN
+GLOBAL(\label)
+	leaq	\func(%rip), %rax
+	jmp	ia32_ptregs_common
+	.endm
+
+	PTREGSCALL stub32_rt_sigreturn,	sys32_rt_sigreturn
+	PTREGSCALL stub32_sigreturn,	sys32_sigreturn
+	PTREGSCALL stub32_fork,		sys_fork
+	PTREGSCALL stub32_vfork,	sys_vfork
+
+	ALIGN
+GLOBAL(stub32_clone)
+	leaq	sys_clone(%rip), %rax
+	/*
+	 * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
+	 * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
+	 *
+	 * The native 64-bit kernel's sys_clone() implements the latter,
+	 * so we need to swap arguments here before calling it:
+	 */
+	xchg	%r8, %rcx
+	jmp	ia32_ptregs_common
+
+	ALIGN
+ia32_ptregs_common:
+	SAVE_EXTRA_REGS 8
+	call	*%rax
+	RESTORE_EXTRA_REGS 8
+	ret
+END(ia32_ptregs_common)
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/entry/syscall_32.c
index 3777189c4a19..8ea34f94e973 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -10,7 +10,7 @@
 #else
 #define SYM(sym, compat) sym
 #define ia32_sys_call_table sys_call_table
-#define __NR_ia32_syscall_max __NR_syscall_max
+#define __NR_syscall_compat_max __NR_syscall_max
 #endif
 
 #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
@@ -23,11 +23,11 @@ typedef asmlinkage void (*sys_call_ptr_t)(void);
 
 extern asmlinkage void sys_ni_syscall(void);
 
-__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
+__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = {
 	/*
 	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
 	 */
-	[0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
+	[0 ... __NR_syscall_compat_max] = &sys_ni_syscall,
 #include <asm/syscalls_32.h>
 };
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/entry/syscall_64.c
index 4ac730b37f0b..4ac730b37f0b 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
diff --git a/arch/x86/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile
index a55abb9f6c5e..57aa59fd140c 100644
--- a/arch/x86/syscalls/Makefile
+++ b/arch/x86/entry/syscalls/Makefile
@@ -1,5 +1,5 @@
-out := $(obj)/../include/generated/asm
-uapi := $(obj)/../include/generated/uapi/asm
+out := $(obj)/../../include/generated/asm
+uapi := $(obj)/../../include/generated/uapi/asm
 
 # Create output directory if not already present
 _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ef8187f9d28d..ef8187f9d28d 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 9ef32d5f1b19..9ef32d5f1b19 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh
index 31fd5f1f38f7..31fd5f1f38f7 100644
--- a/arch/x86/syscalls/syscallhdr.sh
+++ b/arch/x86/entry/syscalls/syscallhdr.sh
diff --git a/arch/x86/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
index 0e7f8ec071e7..0e7f8ec071e7 100644
--- a/arch/x86/syscalls/syscalltbl.sh
+++ b/arch/x86/entry/syscalls/syscalltbl.sh
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/entry/thunk_32.S
index 5eb715087b80..e5a17114a8c4 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/entry/thunk_32.S
@@ -6,16 +6,14 @@
  */
 	#include <linux/linkage.h>
 	#include <asm/asm.h>
-	#include <asm/dwarf2.h>
 
 	/* put return address in eax (arg1) */
 	.macro THUNK name, func, put_ret_addr_in_eax=0
 	.globl \name
 \name:
-	CFI_STARTPROC
-	pushl_cfi_reg eax
-	pushl_cfi_reg ecx
-	pushl_cfi_reg edx
+	pushl %eax
+	pushl %ecx
+	pushl %edx
 
 	.if \put_ret_addr_in_eax
 	/* Place EIP in the arg1 */
@@ -23,11 +21,10 @@
 	.endif
 
 	call \func
-	popl_cfi_reg edx
-	popl_cfi_reg ecx
-	popl_cfi_reg eax
+	popl %edx
+	popl %ecx
+	popl %eax
 	ret
-	CFI_ENDPROC
 	_ASM_NOKPROBE(\name)
 	.endm
 
@@ -38,8 +35,6 @@
 
 #ifdef CONFIG_PREEMPT
 	THUNK ___preempt_schedule, preempt_schedule
-#ifdef CONFIG_CONTEXT_TRACKING
-	THUNK ___preempt_schedule_context, preempt_schedule_context
-#endif
+	THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
 #endif
 
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/entry/thunk_64.S
index f89ba4e93025..efb2b932b748 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -6,35 +6,32 @@
  * Subject to the GNU public license, v.2. No warranty of any kind.
  */
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
-#include <asm/calling.h>
+#include "calling.h"
 #include <asm/asm.h>
 
 	/* rdi:	arg1 ... normal C conventions. rax is saved/restored. */
 	.macro THUNK name, func, put_ret_addr_in_rdi=0
 	.globl \name
 \name:
-	CFI_STARTPROC
 
 	/* this one pushes 9 elems, the next one would be %rIP */
-	pushq_cfi_reg rdi
-	pushq_cfi_reg rsi
-	pushq_cfi_reg rdx
-	pushq_cfi_reg rcx
-	pushq_cfi_reg rax
-	pushq_cfi_reg r8
-	pushq_cfi_reg r9
-	pushq_cfi_reg r10
-	pushq_cfi_reg r11
+	pushq %rdi
+	pushq %rsi
+	pushq %rdx
+	pushq %rcx
+	pushq %rax
+	pushq %r8
+	pushq %r9
+	pushq %r10
+	pushq %r11
 
 	.if \put_ret_addr_in_rdi
 	/* 9*8(%rsp) is return addr on stack */
-	movq_cfi_restore 9*8, rdi
+	movq 9*8(%rsp), %rdi
 	.endif
 
 	call \func
 	jmp  restore
-	CFI_ENDPROC
 	_ASM_NOKPROBE(\name)
 	.endm
 
@@ -49,27 +46,22 @@
 
 #ifdef CONFIG_PREEMPT
 	THUNK ___preempt_schedule, preempt_schedule
-#ifdef CONFIG_CONTEXT_TRACKING
-	THUNK ___preempt_schedule_context, preempt_schedule_context
-#endif
+	THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
 #endif
 
 #if defined(CONFIG_TRACE_IRQFLAGS) \
  || defined(CONFIG_DEBUG_LOCK_ALLOC) \
  || defined(CONFIG_PREEMPT)
-	CFI_STARTPROC
-	CFI_ADJUST_CFA_OFFSET 9*8
 restore:
-	popq_cfi_reg r11
-	popq_cfi_reg r10
-	popq_cfi_reg r9
-	popq_cfi_reg r8
-	popq_cfi_reg rax
-	popq_cfi_reg rcx
-	popq_cfi_reg rdx
-	popq_cfi_reg rsi
-	popq_cfi_reg rdi
+	popq %r11
+	popq %r10
+	popq %r9
+	popq %r8
+	popq %rax
+	popq %rcx
+	popq %rdx
+	popq %rsi
+	popq %rdi
 	ret
-	CFI_ENDPROC
 	_ASM_NOKPROBE(restore)
 #endif
diff --git a/arch/x86/vdso/.gitignore b/arch/x86/entry/vdso/.gitignore
index aae8ffdd5880..aae8ffdd5880 100644
--- a/arch/x86/vdso/.gitignore
+++ b/arch/x86/entry/vdso/.gitignore
diff --git a/arch/x86/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index e97032069f88..e97032069f88 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh
index 7ee90a9b549d..7ee90a9b549d 100755
--- a/arch/x86/vdso/checkundef.sh
+++ b/arch/x86/entry/vdso/checkundef.sh
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 9793322751e0..9793322751e0 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index de2c921025f5..de2c921025f5 100644
--- a/arch/x86/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
diff --git a/arch/x86/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S
index 79a071e4357e..79a071e4357e 100644
--- a/arch/x86/vdso/vdso-note.S
+++ b/arch/x86/entry/vdso/vdso-note.S
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index 6807932643c2..6807932643c2 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 8627db24a7f6..8627db24a7f6 100644
--- a/arch/x86/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 0224987556ce..0224987556ce 100644
--- a/arch/x86/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index e904c270573b..e904c270573b 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
diff --git a/arch/x86/vdso/vdso32/.gitignore b/arch/x86/entry/vdso/vdso32/.gitignore
index e45fba9d0ced..e45fba9d0ced 100644
--- a/arch/x86/vdso/vdso32/.gitignore
+++ b/arch/x86/entry/vdso/vdso32/.gitignore
diff --git a/arch/x86/vdso/vdso32/int80.S b/arch/x86/entry/vdso/vdso32/int80.S
index b15b7c01aedb..b15b7c01aedb 100644
--- a/arch/x86/vdso/vdso32/int80.S
+++ b/arch/x86/entry/vdso/vdso32/int80.S
diff --git a/arch/x86/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S
index c83f25734696..c83f25734696 100644
--- a/arch/x86/vdso/vdso32/note.S
+++ b/arch/x86/entry/vdso/vdso32/note.S
diff --git a/arch/x86/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S
index d7ec4e251c0a..d7ec4e251c0a 100644
--- a/arch/x86/vdso/vdso32/sigreturn.S
+++ b/arch/x86/entry/vdso/vdso32/sigreturn.S
diff --git a/arch/x86/vdso/vdso32/syscall.S b/arch/x86/entry/vdso/vdso32/syscall.S
index 6b286bb5251c..6b286bb5251c 100644
--- a/arch/x86/vdso/vdso32/syscall.S
+++ b/arch/x86/entry/vdso/vdso32/syscall.S
diff --git a/arch/x86/vdso/vdso32/sysenter.S b/arch/x86/entry/vdso/vdso32/sysenter.S
index e354bceee0e0..e354bceee0e0 100644
--- a/arch/x86/vdso/vdso32/sysenter.S
+++ b/arch/x86/entry/vdso/vdso32/sysenter.S
diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
index 175cc72c0f68..175cc72c0f68 100644
--- a/arch/x86/vdso/vdso32/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
diff --git a/arch/x86/vdso/vdso32/vdso-fakesections.c b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c
index 541468e25265..541468e25265 100644
--- a/arch/x86/vdso/vdso32/vdso-fakesections.c
+++ b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c
diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 31056cf294bf..31056cf294bf 100644
--- a/arch/x86/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S
index 697c11ece90c..697c11ece90c 100644
--- a/arch/x86/vdso/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c
index 8ec3d1f4ce9a..8ec3d1f4ce9a 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/entry/vdso/vgetcpu.c
diff --git a/arch/x86/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 1c9f750c3859..1c9f750c3859 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile
new file mode 100644
index 000000000000..a9f4856f622a
--- /dev/null
+++ b/arch/x86/entry/vsyscall/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the x86 low level vsyscall code
+#
+obj-y					:= vsyscall_gtod.o
+
+obj-$(CONFIG_X86_VSYSCALL_EMULATION)	+= vsyscall_64.o vsyscall_emu_64.o
+
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 2dcc6ff6fdcc..2dcc6ff6fdcc 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
index c9596a9af159..c9596a9af159 100644
--- a/arch/x86/kernel/vsyscall_emu_64.S
+++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
index 51e330416995..51e330416995 100644
--- a/arch/x86/kernel/vsyscall_gtod.c
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/entry/vsyscall/vsyscall_trace.h
index a8b2edec54fe..9dd7359a38a8 100644
--- a/arch/x86/kernel/vsyscall_trace.h
+++ b/arch/x86/entry/vsyscall/vsyscall_trace.h
@@ -24,6 +24,6 @@ TRACE_EVENT(emulate_vsyscall,
 #endif
 
 #undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH ../../arch/x86/kernel
+#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/
 #define TRACE_INCLUDE_FILE vsyscall_trace
 #include <trace/define_trace.h>
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index bb635c641869..cd4339bae066 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the ia32 kernel emulation subsystem.
 #
 
-obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
+obj-$(CONFIG_IA32_EMULATION) := sys_ia32.o ia32_signal.o
 
 obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
 
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index c81d35e6c7f1..ae3a29ae875b 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -21,8 +21,8 @@
 #include <linux/binfmts.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
 #include <asm/ptrace.h>
 #include <asm/ia32_unistd.h>
 #include <asm/user32.h>
@@ -198,7 +198,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
 		buf = compat_ptr(tmp);
 	} get_user_catch(err);
 
-	err |= restore_xstate_sig(buf, 1);
+	err |= fpu__restore_sig(buf, 1);
 
 	force_iret();
 
@@ -308,6 +308,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
 				 size_t frame_size,
 				 void __user **fpstate)
 {
+	struct fpu *fpu = &current->thread.fpu;
 	unsigned long sp;
 
 	/* Default to using normal stack */
@@ -322,12 +323,12 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
 		 ksig->ka.sa.sa_restorer)
 		sp = (unsigned long) ksig->ka.sa.sa_restorer;
 
-	if (used_math()) {
+	if (fpu->fpstate_active) {
 		unsigned long fx_aligned, math_size;
 
-		sp = alloc_mathframe(sp, 1, &fx_aligned, &math_size);
+		sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
 		*fpstate = (struct _fpstate_ia32 __user *) sp;
-		if (save_xstate_sig(*fpstate, (void __user *)fx_aligned,
+		if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
 				    math_size) < 0)
 			return (void __user *) -1L;
 	}
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
deleted file mode 100644
index 72bf2680f819..000000000000
--- a/arch/x86/ia32/ia32entry.S
+++ /dev/null
@@ -1,611 +0,0 @@
-/*
- * Compatibility mode system call entry point for x86-64. 
- * 		
- * Copyright 2000-2002 Andi Kleen, SuSE Labs.
- */		 
-
-#include <asm/dwarf2.h>
-#include <asm/calling.h>
-#include <asm/asm-offsets.h>
-#include <asm/current.h>
-#include <asm/errno.h>
-#include <asm/ia32_unistd.h>	
-#include <asm/thread_info.h>	
-#include <asm/segment.h>
-#include <asm/irqflags.h>
-#include <asm/asm.h>
-#include <asm/smap.h>
-#include <linux/linkage.h>
-#include <linux/err.h>
-
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE	   0x40000000
-
-#ifndef CONFIG_AUDITSYSCALL
-#define sysexit_audit ia32_ret_from_sys_call
-#define sysretl_audit ia32_ret_from_sys_call
-#endif
-
-	.section .entry.text, "ax"
-
-	/* clobbers %rax */
-	.macro  CLEAR_RREGS _r9=rax
-	xorl 	%eax,%eax
-	movq	%rax,R11(%rsp)
-	movq	%rax,R10(%rsp)
-	movq	%\_r9,R9(%rsp)
-	movq	%rax,R8(%rsp)
-	.endm
-
-	/*
-	 * Reload arg registers from stack in case ptrace changed them.
-	 * We don't reload %eax because syscall_trace_enter() returned
-	 * the %rax value we should see.  Instead, we just truncate that
-	 * value to 32 bits again as we did on entry from user mode.
-	 * If it's a new value set by user_regset during entry tracing,
-	 * this matches the normal truncation of the user-mode value.
-	 * If it's -1 to make us punt the syscall, then (u32)-1 is still
-	 * an appropriately invalid value.
-	 */
-	.macro LOAD_ARGS32 _r9=0
-	.if \_r9
-	movl R9(%rsp),%r9d
-	.endif
-	movl RCX(%rsp),%ecx
-	movl RDX(%rsp),%edx
-	movl RSI(%rsp),%esi
-	movl RDI(%rsp),%edi
-	movl %eax,%eax			/* zero extension */
-	.endm
-	
-	.macro CFI_STARTPROC32 simple
-	CFI_STARTPROC	\simple
-	CFI_UNDEFINED	r8
-	CFI_UNDEFINED	r9
-	CFI_UNDEFINED	r10
-	CFI_UNDEFINED	r11
-	CFI_UNDEFINED	r12
-	CFI_UNDEFINED	r13
-	CFI_UNDEFINED	r14
-	CFI_UNDEFINED	r15
-	.endm
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_usergs_sysret32)
-	swapgs
-	sysretl
-ENDPROC(native_usergs_sysret32)
-
-ENTRY(native_irq_enable_sysexit)
-	swapgs
-	sti
-	sysexit
-ENDPROC(native_irq_enable_sysexit)
-#endif
-
-/*
- * 32bit SYSENTER instruction entry.
- *
- * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
- * IF and VM in rflags are cleared (IOW: interrupts are off).
- * SYSENTER does not save anything on the stack,
- * and does not save old rip (!!!) and rflags.
- *
- * Arguments:
- * eax  system call number
- * ebx  arg1
- * ecx  arg2
- * edx  arg3
- * esi  arg4
- * edi  arg5
- * ebp  user stack
- * 0(%ebp) arg6
- *
- * This is purely a fast path. For anything complicated we use the int 0x80
- * path below. We set up a complete hardware stack frame to share code
- * with the int 0x80 path.
- */
-ENTRY(ia32_sysenter_target)
-	CFI_STARTPROC32	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,0
-	CFI_REGISTER	rsp,rbp
-
-	/*
-	 * Interrupts are off on entry.
-	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
-	 * it is too small to ever cause noticeable irq latency.
-	 */
-	SWAPGS_UNSAFE_STACK
-	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
-	ENABLE_INTERRUPTS(CLBR_NONE)
-
-	/* Zero-extending 32-bit regs, do not remove */
-	movl	%ebp, %ebp
-	movl	%eax, %eax
-
-	movl	ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d
-	CFI_REGISTER rip,r10
-
-	/* Construct struct pt_regs on stack */
-	pushq_cfi	$__USER32_DS		/* pt_regs->ss */
-	pushq_cfi	%rbp			/* pt_regs->sp */
-	CFI_REL_OFFSET	rsp,0
-	pushfq_cfi				/* pt_regs->flags */
-	pushq_cfi	$__USER32_CS		/* pt_regs->cs */
-	pushq_cfi	%r10 /* pt_regs->ip = thread_info->sysenter_return */
-	CFI_REL_OFFSET	rip,0
-	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
-	pushq_cfi_reg	rdi			/* pt_regs->di */
-	pushq_cfi_reg	rsi			/* pt_regs->si */
-	pushq_cfi_reg	rdx			/* pt_regs->dx */
-	pushq_cfi_reg	rcx			/* pt_regs->cx */
-	pushq_cfi_reg	rax			/* pt_regs->ax */
-	cld
-	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
-	CFI_ADJUST_CFA_OFFSET 10*8
-
-	/*
-	 * no need to do an access_ok check here because rbp has been
-	 * 32bit zero extended
-	 */
-	ASM_STAC
-1:	movl	(%rbp),%ebp
-	_ASM_EXTABLE(1b,ia32_badarg)
-	ASM_CLAC
-
-	/*
-	 * Sysenter doesn't filter flags, so we need to clear NT
-	 * ourselves.  To save a few cycles, we can check whether
-	 * NT was set instead of doing an unconditional popfq.
-	 */
-	testl $X86_EFLAGS_NT,EFLAGS(%rsp)
-	jnz sysenter_fix_flags
-sysenter_flags_fixed:
-
-	orl     $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	CFI_REMEMBER_STATE
-	jnz  sysenter_tracesys
-	cmpq	$(IA32_NR_syscalls-1),%rax
-	ja	ia32_badsys
-sysenter_do_call:
-	/* 32bit syscall -> 64bit C ABI argument conversion */
-	movl	%edi,%r8d	/* arg5 */
-	movl	%ebp,%r9d	/* arg6 */
-	xchg	%ecx,%esi	/* rsi:arg2, rcx:arg4 */
-	movl	%ebx,%edi	/* arg1 */
-	movl	%edx,%edx	/* arg3 (zero extension) */
-sysenter_dispatch:
-	call	*ia32_sys_call_table(,%rax,8)
-	movq	%rax,RAX(%rsp)
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz	sysexit_audit
-sysexit_from_sys_call:
-	/*
-	 * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
-	 * NMI between STI and SYSEXIT has poorly specified behavior,
-	 * and and NMI followed by an IRQ with usergs is fatal.  So
-	 * we just pretend we're using SYSEXIT but we really use
-	 * SYSRETL instead.
-	 *
-	 * This code path is still called 'sysexit' because it pairs
-	 * with 'sysenter' and it uses the SYSENTER calling convention.
-	 */
-	andl    $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	movl	RIP(%rsp),%ecx		/* User %eip */
-	CFI_REGISTER rip,rcx
-	RESTORE_RSI_RDI
-	xorl	%edx,%edx		/* avoid info leaks */
-	xorq	%r8,%r8
-	xorq	%r9,%r9
-	xorq	%r10,%r10
-	movl	EFLAGS(%rsp),%r11d	/* User eflags */
-	/*CFI_RESTORE rflags*/
-	TRACE_IRQS_ON
-
-	/*
-	 * SYSRETL works even on Intel CPUs.  Use it in preference to SYSEXIT,
-	 * since it avoids a dicey window with interrupts enabled.
-	 */
-	movl	RSP(%rsp),%esp
-
-	/*
-	 * USERGS_SYSRET32 does:
-	 *  gsbase = user's gs base
-	 *  eip = ecx
-	 *  rflags = r11
-	 *  cs = __USER32_CS
-	 *  ss = __USER_DS
-	 *
-	 * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
-	 *
-	 *  pop %ebp
-	 *  pop %edx
-	 *  pop %ecx
-	 *
-	 * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
-	 * avoid info leaks.  R11 ends up with VDSO32_SYSENTER_RETURN's
-	 * address (already known to user code), and R12-R15 are
-	 * callee-saved and therefore don't contain any interesting
-	 * kernel data.
-	 */
-	USERGS_SYSRET32
-
-	CFI_RESTORE_STATE
-
-#ifdef CONFIG_AUDITSYSCALL
-	.macro auditsys_entry_common
-	movl %esi,%r8d			/* 5th arg: 4th syscall arg */
-	movl %ecx,%r9d			/*swap with edx*/
-	movl %edx,%ecx			/* 4th arg: 3rd syscall arg */
-	movl %r9d,%edx			/* 3rd arg: 2nd syscall arg */
-	movl %ebx,%esi			/* 2nd arg: 1st syscall arg */
-	movl %eax,%edi			/* 1st arg: syscall number */
-	call __audit_syscall_entry
-	movl RAX(%rsp),%eax	/* reload syscall number */
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja ia32_badsys
-	movl %ebx,%edi			/* reload 1st syscall arg */
-	movl RCX(%rsp),%esi	/* reload 2nd syscall arg */
-	movl RDX(%rsp),%edx	/* reload 3rd syscall arg */
-	movl RSI(%rsp),%ecx	/* reload 4th syscall arg */
-	movl RDI(%rsp),%r8d	/* reload 5th syscall arg */
-	.endm
-
-	.macro auditsys_exit exit
-	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz ia32_ret_from_sys_call
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	movl %eax,%esi		/* second arg, syscall return value */
-	cmpl $-MAX_ERRNO,%eax	/* is it an error ? */
-	jbe 1f
-	movslq %eax, %rsi	/* if error sign extend to 64 bits */
-1:	setbe %al		/* 1 if error, 0 if not */
-	movzbl %al,%edi		/* zero-extend that into %edi */
-	call __audit_syscall_exit
-	movq RAX(%rsp),%rax	/* reload syscall return value */
-	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jz \exit
-	CLEAR_RREGS
-	jmp int_with_check
-	.endm
-
-sysenter_auditsys:
-	auditsys_entry_common
-	movl %ebp,%r9d			/* reload 6th syscall arg */
-	jmp sysenter_dispatch
-
-sysexit_audit:
-	auditsys_exit sysexit_from_sys_call
-#endif
-
-sysenter_fix_flags:
-	pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED)
-	popfq_cfi
-	jmp sysenter_flags_fixed
-
-sysenter_tracesys:
-#ifdef CONFIG_AUDITSYSCALL
-	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jz	sysenter_auditsys
-#endif
-	SAVE_EXTRA_REGS
-	CLEAR_RREGS
-	movq	$-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
-	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
-	call	syscall_trace_enter
-	LOAD_ARGS32  /* reload args from stack in case ptrace changed it */
-	RESTORE_EXTRA_REGS
-	cmpq	$(IA32_NR_syscalls-1),%rax
-	ja	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
-	jmp	sysenter_do_call
-	CFI_ENDPROC
-ENDPROC(ia32_sysenter_target)
-
-/*
- * 32bit SYSCALL instruction entry.
- *
- * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
- * then loads new ss, cs, and rip from previously programmed MSRs.
- * rflags gets masked by a value from another MSR (so CLD and CLAC
- * are not needed). SYSCALL does not save anything on the stack
- * and does not change rsp.
- *
- * Note: rflags saving+masking-with-MSR happens only in Long mode
- * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it).
- * Don't get confused: rflags saving+masking depends on Long Mode Active bit
- * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
- * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
- *
- * Arguments:
- * eax  system call number
- * ecx  return address
- * ebx  arg1
- * ebp  arg2	(note: not saved in the stack frame, should not be touched)
- * edx  arg3
- * esi  arg4
- * edi  arg5
- * esp  user stack
- * 0(%esp) arg6
- *
- * This is purely a fast path. For anything complicated we use the int 0x80
- * path below. We set up a complete hardware stack frame to share code
- * with the int 0x80 path.
- */
-ENTRY(ia32_cstar_target)
-	CFI_STARTPROC32	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,0
-	CFI_REGISTER	rip,rcx
-	/*CFI_REGISTER	rflags,r11*/
-
-	/*
-	 * Interrupts are off on entry.
-	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
-	 * it is too small to ever cause noticeable irq latency.
-	 */
-	SWAPGS_UNSAFE_STACK
-	movl	%esp,%r8d
-	CFI_REGISTER	rsp,r8
-	movq	PER_CPU_VAR(kernel_stack),%rsp
-	ENABLE_INTERRUPTS(CLBR_NONE)
-
-	/* Zero-extending 32-bit regs, do not remove */
-	movl	%eax,%eax
-
-	/* Construct struct pt_regs on stack */
-	pushq_cfi	$__USER32_DS		/* pt_regs->ss */
-	pushq_cfi	%r8			/* pt_regs->sp */
-	CFI_REL_OFFSET rsp,0
-	pushq_cfi	%r11			/* pt_regs->flags */
-	pushq_cfi	$__USER32_CS		/* pt_regs->cs */
-	pushq_cfi	%rcx			/* pt_regs->ip */
-	CFI_REL_OFFSET rip,0
-	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
-	pushq_cfi_reg	rdi			/* pt_regs->di */
-	pushq_cfi_reg	rsi			/* pt_regs->si */
-	pushq_cfi_reg	rdx			/* pt_regs->dx */
-	pushq_cfi_reg	rbp			/* pt_regs->cx */
-	movl	%ebp,%ecx
-	pushq_cfi_reg	rax			/* pt_regs->ax */
-	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
-	CFI_ADJUST_CFA_OFFSET 10*8
-
-	/*
-	 * no need to do an access_ok check here because r8 has been
-	 * 32bit zero extended
-	 */
-	ASM_STAC
-1:	movl	(%r8),%r9d
-	_ASM_EXTABLE(1b,ia32_badarg)
-	ASM_CLAC
-	orl     $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	CFI_REMEMBER_STATE
-	jnz   cstar_tracesys
-	cmpq $IA32_NR_syscalls-1,%rax
-	ja  ia32_badsys
-cstar_do_call:
-	/* 32bit syscall -> 64bit C ABI argument conversion */
-	movl	%edi,%r8d	/* arg5 */
-	/* r9 already loaded */	/* arg6 */
-	xchg	%ecx,%esi	/* rsi:arg2, rcx:arg4 */
-	movl	%ebx,%edi	/* arg1 */
-	movl	%edx,%edx	/* arg3 (zero extension) */
-cstar_dispatch:
-	call *ia32_sys_call_table(,%rax,8)
-	movq %rax,RAX(%rsp)
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz sysretl_audit
-sysretl_from_sys_call:
-	andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	RESTORE_RSI_RDI_RDX
-	movl RIP(%rsp),%ecx
-	CFI_REGISTER rip,rcx
-	movl EFLAGS(%rsp),%r11d
-	/*CFI_REGISTER rflags,r11*/
-	xorq	%r10,%r10
-	xorq	%r9,%r9
-	xorq	%r8,%r8
-	TRACE_IRQS_ON
-	movl RSP(%rsp),%esp
-	CFI_RESTORE rsp
-	/*
-	 * 64bit->32bit SYSRET restores eip from ecx,
-	 * eflags from r11 (but RF and VM bits are forced to 0),
-	 * cs and ss are loaded from MSRs.
-	 * (Note: 32bit->32bit SYSRET is different: since r11
-	 * does not exist, it merely sets eflags.IF=1).
-	 *
-	 * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
-	 * descriptor is not reinitialized.  This means that we must
-	 * avoid SYSRET with SS == NULL, which could happen if we schedule,
-	 * exit the kernel, and re-enter using an interrupt vector.  (All
-	 * interrupt entries on x86_64 set SS to NULL.)  We prevent that
-	 * from happening by reloading SS in __switch_to.
-	 */
-	USERGS_SYSRET32
-
-#ifdef CONFIG_AUDITSYSCALL
-cstar_auditsys:
-	CFI_RESTORE_STATE
-	movl %r9d,R9(%rsp)	/* register to be clobbered by call */
-	auditsys_entry_common
-	movl R9(%rsp),%r9d	/* reload 6th syscall arg */
-	jmp cstar_dispatch
-
-sysretl_audit:
-	auditsys_exit sysretl_from_sys_call
-#endif
-
-cstar_tracesys:
-#ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jz cstar_auditsys
-#endif
-	xchgl %r9d,%ebp
-	SAVE_EXTRA_REGS
-	CLEAR_RREGS r9
-	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
-	movq %rsp,%rdi        /* &pt_regs -> arg1 */
-	call syscall_trace_enter
-	LOAD_ARGS32 1	/* reload args from stack in case ptrace changed it */
-	RESTORE_EXTRA_REGS
-	xchgl %ebp,%r9d
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
-	jmp cstar_do_call
-END(ia32_cstar_target)
-				
-ia32_badarg:
-	ASM_CLAC
-	movq $-EFAULT,%rax
-	jmp ia32_sysret
-	CFI_ENDPROC
-
-/*
- * Emulated IA32 system calls via int 0x80.
- *
- * Arguments:
- * eax  system call number
- * ebx  arg1
- * ecx  arg2
- * edx  arg3
- * esi  arg4
- * edi  arg5
- * ebp  arg6	(note: not saved in the stack frame, should not be touched)
- *
- * Notes:
- * Uses the same stack frame as the x86-64 version.
- * All registers except eax must be saved (but ptrace may violate that).
- * Arguments are zero extended. For system calls that want sign extension and
- * take long arguments a wrapper is needed. Most calls can just be called
- * directly.
- * Assumes it is only called from user space and entered with interrupts off.
- */
-
-ENTRY(ia32_syscall)
-	CFI_STARTPROC32	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,5*8
-	/*CFI_REL_OFFSET	ss,4*8 */
-	CFI_REL_OFFSET	rsp,3*8
-	/*CFI_REL_OFFSET	rflags,2*8 */
-	/*CFI_REL_OFFSET	cs,1*8 */
-	CFI_REL_OFFSET	rip,0*8
-
-	/*
-	 * Interrupts are off on entry.
-	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
-	 * it is too small to ever cause noticeable irq latency.
-	 */
-	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	SWAPGS
-	ENABLE_INTERRUPTS(CLBR_NONE)
-
-	/* Zero-extending 32-bit regs, do not remove */
-	movl	%eax,%eax
-
-	/* Construct struct pt_regs on stack (iret frame is already on stack) */
-	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
-	pushq_cfi_reg	rdi			/* pt_regs->di */
-	pushq_cfi_reg	rsi			/* pt_regs->si */
-	pushq_cfi_reg	rdx			/* pt_regs->dx */
-	pushq_cfi_reg	rcx			/* pt_regs->cx */
-	pushq_cfi_reg	rax			/* pt_regs->ax */
-	cld
-	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
-	CFI_ADJUST_CFA_OFFSET 10*8
-
-	orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz ia32_tracesys
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja ia32_badsys
-ia32_do_call:
-	/* 32bit syscall -> 64bit C ABI argument conversion */
-	movl %edi,%r8d	/* arg5 */
-	movl %ebp,%r9d	/* arg6 */
-	xchg %ecx,%esi	/* rsi:arg2, rcx:arg4 */
-	movl %ebx,%edi	/* arg1 */
-	movl %edx,%edx	/* arg3 (zero extension) */
-	call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
-ia32_sysret:
-	movq %rax,RAX(%rsp)
-ia32_ret_from_sys_call:
-	CLEAR_RREGS
-	jmp int_ret_from_sys_call
-
-ia32_tracesys:
-	SAVE_EXTRA_REGS
-	CLEAR_RREGS
-	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
-	movq %rsp,%rdi        /* &pt_regs -> arg1 */
-	call syscall_trace_enter
-	LOAD_ARGS32	/* reload args from stack in case ptrace changed it */
-	RESTORE_EXTRA_REGS
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja  int_ret_from_sys_call	/* ia32_tracesys has set RAX(%rsp) */
-	jmp ia32_do_call
-END(ia32_syscall)
-
-ia32_badsys:
-	movq $0,ORIG_RAX(%rsp)
-	movq $-ENOSYS,%rax
-	jmp ia32_sysret
-
-	CFI_ENDPROC
-	
-	.macro PTREGSCALL label, func
-	ALIGN
-GLOBAL(\label)
-	leaq \func(%rip),%rax
-	jmp  ia32_ptregs_common	
-	.endm
-
-	CFI_STARTPROC32
-
-	PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
-	PTREGSCALL stub32_sigreturn, sys32_sigreturn
-	PTREGSCALL stub32_fork, sys_fork
-	PTREGSCALL stub32_vfork, sys_vfork
-
-	ALIGN
-GLOBAL(stub32_clone)
-	leaq sys_clone(%rip),%rax
-	mov	%r8, %rcx
-	jmp  ia32_ptregs_common	
-
-	ALIGN
-ia32_ptregs_common:
-	CFI_ENDPROC
-	CFI_STARTPROC32	simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,SIZEOF_PTREGS
-	CFI_REL_OFFSET	rax,RAX
-	CFI_REL_OFFSET	rcx,RCX
-	CFI_REL_OFFSET	rdx,RDX
-	CFI_REL_OFFSET	rsi,RSI
-	CFI_REL_OFFSET	rdi,RDI
-	CFI_REL_OFFSET	rip,RIP
-/*	CFI_REL_OFFSET	cs,CS*/
-/*	CFI_REL_OFFSET	rflags,EFLAGS*/
-	CFI_REL_OFFSET	rsp,RSP
-/*	CFI_REL_OFFSET	ss,SS*/
-	SAVE_EXTRA_REGS 8
-	call *%rax
-	RESTORE_EXTRA_REGS 8
-	ret
-	CFI_ENDPROC
-END(ia32_ptregs_common)
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index bdf02eeee765..e7636bac7372 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -18,6 +18,12 @@
 	.endm
 #endif
 
+/*
+ * Issue one struct alt_instr descriptor entry (need to put it into
+ * the section .altinstructions, see below). This entry contains
+ * enough information for the alternatives patching code to patch an
+ * instruction. See apply_alternatives().
+ */
 .macro altinstruction_entry orig alt feature orig_len alt_len pad_len
 	.long \orig - .
 	.long \alt - .
@@ -27,6 +33,12 @@
 	.byte \pad_len
 .endm
 
+/*
+ * Define an alternative between two instructions. If @feature is
+ * present, early code in apply_alternatives() replaces @oldinstr with
+ * @newinstr. ".skip" directive takes care of proper instruction padding
+ * in case @newinstr is longer than @oldinstr.
+ */
 .macro ALTERNATIVE oldinstr, newinstr, feature
 140:
 	\oldinstr
@@ -55,6 +67,12 @@
  */
 #define alt_max_short(a, b)	((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
 
+
+/*
+ * Same as ALTERNATIVE macro above but for two alternatives. If CPU
+ * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
+ * @feature2, it replaces @oldinstr with @feature2.
+ */
 .macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
 140:
 	\oldinstr
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index ba32af062f61..7bfc85bbb8ff 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -52,6 +52,12 @@ struct alt_instr {
 	u8  padlen;		/* length of build-time padding */
 } __packed;
 
+/*
+ * Debug flag that can be tested to see whether alternative
+ * instructions were patched in already:
+ */
+extern int alternatives_patched;
+
 extern void alternative_instructions(void);
 extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
 
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index aaac3b2fb746..1a5da2e63aee 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -98,11 +98,22 @@ static inline u16 amd_get_node_id(struct pci_dev *pdev)
 	return 0;
 }
 
+static inline bool amd_gart_present(void)
+{
+	/* GART present only on Fam15h, upto model 0fh */
+	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+	    (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
+		return true;
+
+	return false;
+}
+
 #else
 
 #define amd_nb_num(x)		0
 #define amd_nb_has_feature(x)	false
 #define node_to_amd_nb(x)	NULL
+#define amd_gart_present(x)	false
 
 #endif
 
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 976b86a325e5..c8393634ca0c 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -644,6 +644,12 @@ static inline void entering_ack_irq(void)
 	entering_irq();
 }
 
+static inline void ipi_entering_ack_irq(void)
+{
+	ack_APIC_irq();
+	irq_enter();
+}
+
 static inline void exiting_irq(void)
 {
 	irq_exit();
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 7730c1c5c83a..189679aba703 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -63,6 +63,31 @@
 	_ASM_ALIGN ;						\
 	_ASM_PTR (entry);					\
 	.popsection
+
+.macro ALIGN_DESTINATION
+	/* check for bad alignment of destination */
+	movl %edi,%ecx
+	andl $7,%ecx
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+100:	movb (%rsi),%al
+101:	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 100b
+102:
+	.section .fixup,"ax"
+103:	addl %ecx,%edx			/* ecx is zerorest also */
+	jmp copy_user_handle_tail
+	.previous
+
+	_ASM_EXTABLE(100b,103b)
+	_ASM_EXTABLE(101b,103b)
+	.endm
+
 #else
 # define _ASM_EXTABLE(from,to)					\
 	" .pushsection \"__ex_table\",\"a\"\n"			\
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 5e5cd123fdfb..e9168955c42f 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -22,7 +22,7 @@
  *
  * Atomically reads the value of @v.
  */
-static inline int atomic_read(const atomic_t *v)
+static __always_inline int atomic_read(const atomic_t *v)
 {
 	return ACCESS_ONCE((v)->counter);
 }
@@ -34,7 +34,7 @@ static inline int atomic_read(const atomic_t *v)
  *
  * Atomically sets the value of @v to @i.
  */
-static inline void atomic_set(atomic_t *v, int i)
+static __always_inline void atomic_set(atomic_t *v, int i)
 {
 	v->counter = i;
 }
@@ -46,7 +46,7 @@ static inline void atomic_set(atomic_t *v, int i)
  *
  * Atomically adds @i to @v.
  */
-static inline void atomic_add(int i, atomic_t *v)
+static __always_inline void atomic_add(int i, atomic_t *v)
 {
 	asm volatile(LOCK_PREFIX "addl %1,%0"
 		     : "+m" (v->counter)
@@ -60,7 +60,7 @@ static inline void atomic_add(int i, atomic_t *v)
  *
  * Atomically subtracts @i from @v.
  */
-static inline void atomic_sub(int i, atomic_t *v)
+static __always_inline void atomic_sub(int i, atomic_t *v)
 {
 	asm volatile(LOCK_PREFIX "subl %1,%0"
 		     : "+m" (v->counter)
@@ -76,7 +76,7 @@ static inline void atomic_sub(int i, atomic_t *v)
  * true if the result is zero, or false for all
  * other cases.
  */
-static inline int atomic_sub_and_test(int i, atomic_t *v)
+static __always_inline int atomic_sub_and_test(int i, atomic_t *v)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", "e");
 }
@@ -87,7 +87,7 @@ static inline int atomic_sub_and_test(int i, atomic_t *v)
  *
  * Atomically increments @v by 1.
  */
-static inline void atomic_inc(atomic_t *v)
+static __always_inline void atomic_inc(atomic_t *v)
 {
 	asm volatile(LOCK_PREFIX "incl %0"
 		     : "+m" (v->counter));
@@ -99,7 +99,7 @@ static inline void atomic_inc(atomic_t *v)
  *
  * Atomically decrements @v by 1.
  */
-static inline void atomic_dec(atomic_t *v)
+static __always_inline void atomic_dec(atomic_t *v)
 {
 	asm volatile(LOCK_PREFIX "decl %0"
 		     : "+m" (v->counter));
@@ -113,7 +113,7 @@ static inline void atomic_dec(atomic_t *v)
  * returns true if the result is 0, or false for all other
  * cases.
  */
-static inline int atomic_dec_and_test(atomic_t *v)
+static __always_inline int atomic_dec_and_test(atomic_t *v)
 {
 	GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e");
 }
@@ -126,7 +126,7 @@ static inline int atomic_dec_and_test(atomic_t *v)
  * and returns true if the result is zero, or false for all
  * other cases.
  */
-static inline int atomic_inc_and_test(atomic_t *v)
+static __always_inline int atomic_inc_and_test(atomic_t *v)
 {
 	GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e");
 }
@@ -140,7 +140,7 @@ static inline int atomic_inc_and_test(atomic_t *v)
  * if the result is negative, or false when
  * result is greater than or equal to zero.
  */
-static inline int atomic_add_negative(int i, atomic_t *v)
+static __always_inline int atomic_add_negative(int i, atomic_t *v)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", "s");
 }
@@ -152,7 +152,7 @@ static inline int atomic_add_negative(int i, atomic_t *v)
  *
  * Atomically adds @i to @v and returns @i + @v
  */
-static inline int atomic_add_return(int i, atomic_t *v)
+static __always_inline int atomic_add_return(int i, atomic_t *v)
 {
 	return i + xadd(&v->counter, i);
 }
@@ -164,7 +164,7 @@ static inline int atomic_add_return(int i, atomic_t *v)
  *
  * Atomically subtracts @i from @v and returns @v - @i
  */
-static inline int atomic_sub_return(int i, atomic_t *v)
+static __always_inline int atomic_sub_return(int i, atomic_t *v)
 {
 	return atomic_add_return(-i, v);
 }
@@ -172,7 +172,7 @@ static inline int atomic_sub_return(int i, atomic_t *v)
 #define atomic_inc_return(v)  (atomic_add_return(1, v))
 #define atomic_dec_return(v)  (atomic_sub_return(1, v))
 
-static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
+static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
 	return cmpxchg(&v->counter, old, new);
 }
@@ -191,7 +191,7 @@ static inline int atomic_xchg(atomic_t *v, int new)
  * Atomically adds @a to @v, so long as @v was not already @u.
  * Returns the old value of @v.
  */
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
 	c = atomic_read(v);
@@ -213,7 +213,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
  * Atomically adds 1 to @v
  * Returns the new value of @u
  */
-static inline short int atomic_inc_short(short int *v)
+static __always_inline short int atomic_inc_short(short int *v)
 {
 	asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
 	return *v;
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index f8d273e18516..b965f9e03f2a 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -40,7 +40,7 @@ static inline void atomic64_set(atomic64_t *v, long i)
  *
  * Atomically adds @i to @v.
  */
-static inline void atomic64_add(long i, atomic64_t *v)
+static __always_inline void atomic64_add(long i, atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "addq %1,%0"
 		     : "=m" (v->counter)
@@ -81,7 +81,7 @@ static inline int atomic64_sub_and_test(long i, atomic64_t *v)
  *
  * Atomically increments @v by 1.
  */
-static inline void atomic64_inc(atomic64_t *v)
+static __always_inline void atomic64_inc(atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "incq %0"
 		     : "=m" (v->counter)
@@ -94,7 +94,7 @@ static inline void atomic64_inc(atomic64_t *v)
  *
  * Atomically decrements @v by 1.
  */
-static inline void atomic64_dec(atomic64_t *v)
+static __always_inline void atomic64_dec(atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "decq %0"
 		     : "=m" (v->counter)
@@ -148,7 +148,7 @@ static inline int atomic64_add_negative(long i, atomic64_t *v)
  *
  * Atomically adds @i to @v and returns @i + @v
  */
-static inline long atomic64_add_return(long i, atomic64_t *v)
+static __always_inline long atomic64_add_return(long i, atomic64_t *v)
 {
 	return i + xadd(&v->counter, i);
 }
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 959e45b81fe2..e51a8f803f55 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -35,12 +35,12 @@
 #define smp_mb()	mb()
 #define smp_rmb()	dma_rmb()
 #define smp_wmb()	barrier()
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
 #else /* !SMP */
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
 #endif /* SMP */
 
 #define read_barrier_depends()		do { } while (0)
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 47c8e32f621a..b6f7457d12e4 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -8,7 +8,7 @@
 /*
  * The set_memory_* API can be used to change various attributes of a virtual
  * address range. The attributes include:
- * Cachability   : UnCached, WriteCombining, WriteBack
+ * Cachability   : UnCached, WriteCombining, WriteThrough, WriteBack
  * Executability : eXeutable, NoteXecutable
  * Read/Write    : ReadOnly, ReadWrite
  * Presence      : NotPresent
@@ -35,9 +35,11 @@
 
 int _set_memory_uc(unsigned long addr, int numpages);
 int _set_memory_wc(unsigned long addr, int numpages);
+int _set_memory_wt(unsigned long addr, int numpages);
 int _set_memory_wb(unsigned long addr, int numpages);
 int set_memory_uc(unsigned long addr, int numpages);
 int set_memory_wc(unsigned long addr, int numpages);
+int set_memory_wt(unsigned long addr, int numpages);
 int set_memory_wb(unsigned long addr, int numpages);
 int set_memory_x(unsigned long addr, int numpages);
 int set_memory_nx(unsigned long addr, int numpages);
@@ -48,10 +50,12 @@ int set_memory_4k(unsigned long addr, int numpages);
 
 int set_memory_array_uc(unsigned long *addr, int addrinarray);
 int set_memory_array_wc(unsigned long *addr, int addrinarray);
+int set_memory_array_wt(unsigned long *addr, int addrinarray);
 int set_memory_array_wb(unsigned long *addr, int addrinarray);
 
 int set_pages_array_uc(struct page **pages, int addrinarray);
 int set_pages_array_wc(struct page **pages, int addrinarray);
+int set_pages_array_wt(struct page **pages, int addrinarray);
 int set_pages_array_wb(struct page **pages, int addrinarray);
 
 /*
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 99c105d78b7e..ad19841eddfe 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -4,8 +4,6 @@
 #include <linux/compiler.h>
 #include <asm/alternative.h> /* Provides LOCK_PREFIX */
 
-#define __HAVE_ARCH_CMPXCHG 1
-
 /*
  * Non-existant functions to indicate usage errors at link time
  * (or compile-time if the compiler implements __compiletime_error().
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
index 1eef55596e82..03bb1065c335 100644
--- a/arch/x86/include/asm/crypto/glue_helper.h
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -7,7 +7,7 @@
 
 #include <linux/kernel.h>
 #include <linux/crypto.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 #include <crypto/b128ops.h>
 
 typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 808dae63eeea..1f5b7287d1ad 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -127,50 +127,14 @@ static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
 
 #define dma_alloc_coherent(d,s,h,f)	dma_alloc_attrs(d,s,h,f,NULL)
 
-static inline void *
+void *
 dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		gfp_t gfp, struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-	void *memory;
-
-	gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
-	if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
-		return memory;
-
-	if (!dev)
-		dev = &x86_dma_fallback_dev;
-
-	if (!is_device_dma_capable(dev))
-		return NULL;
-
-	if (!ops->alloc)
-		return NULL;
-
-	memory = ops->alloc(dev, size, dma_handle,
-			    dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
-	debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
-
-	return memory;
-}
+		gfp_t gfp, struct dma_attrs *attrs);
 
 #define dma_free_coherent(d,s,c,h) dma_free_attrs(d,s,c,h,NULL)
 
-static inline void dma_free_attrs(struct device *dev, size_t size,
-				  void *vaddr, dma_addr_t bus,
-				  struct dma_attrs *attrs)
-{
-	struct dma_map_ops *ops = get_dma_ops(dev);
-
-	WARN_ON(irqs_disabled());       /* for portability */
-
-	if (dma_release_from_coherent(dev, get_order(size), vaddr))
-		return;
-
-	debug_dma_free_coherent(dev, size, vaddr, bus);
-	if (ops->free)
-		ops->free(dev, size, vaddr, bus, attrs);
-}
+void dma_free_attrs(struct device *dev, size_t size,
+		    void *vaddr, dma_addr_t bus,
+		    struct dma_attrs *attrs);
 
 #endif
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
deleted file mode 100644
index de1cdaf4d743..000000000000
--- a/arch/x86/include/asm/dwarf2.h
+++ /dev/null
@@ -1,170 +0,0 @@
-#ifndef _ASM_X86_DWARF2_H
-#define _ASM_X86_DWARF2_H
-
-#ifndef __ASSEMBLY__
-#warning "asm/dwarf2.h should be only included in pure assembly files"
-#endif
-
-/*
- * Macros for dwarf2 CFI unwind table entries.
- * See "as.info" for details on these pseudo ops. Unfortunately
- * they are only supported in very new binutils, so define them
- * away for older version.
- */
-
-#ifdef CONFIG_AS_CFI
-
-#define CFI_STARTPROC		.cfi_startproc
-#define CFI_ENDPROC		.cfi_endproc
-#define CFI_DEF_CFA		.cfi_def_cfa
-#define CFI_DEF_CFA_REGISTER	.cfi_def_cfa_register
-#define CFI_DEF_CFA_OFFSET	.cfi_def_cfa_offset
-#define CFI_ADJUST_CFA_OFFSET	.cfi_adjust_cfa_offset
-#define CFI_OFFSET		.cfi_offset
-#define CFI_REL_OFFSET		.cfi_rel_offset
-#define CFI_REGISTER		.cfi_register
-#define CFI_RESTORE		.cfi_restore
-#define CFI_REMEMBER_STATE	.cfi_remember_state
-#define CFI_RESTORE_STATE	.cfi_restore_state
-#define CFI_UNDEFINED		.cfi_undefined
-#define CFI_ESCAPE		.cfi_escape
-
-#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
-#define CFI_SIGNAL_FRAME	.cfi_signal_frame
-#else
-#define CFI_SIGNAL_FRAME
-#endif
-
-#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__)
-	/*
-	 * Emit CFI data in .debug_frame sections, not .eh_frame sections.
-	 * The latter we currently just discard since we don't do DWARF
-	 * unwinding at runtime.  So only the offline DWARF information is
-	 * useful to anyone.  Note we should not use this directive if this
-	 * file is used in the vDSO assembly, or if vmlinux.lds.S gets
-	 * changed so it doesn't discard .eh_frame.
-	 */
-	.cfi_sections .debug_frame
-#endif
-
-#else
-
-/*
- * Due to the structure of pre-exisiting code, don't use assembler line
- * comment character # to ignore the arguments. Instead, use a dummy macro.
- */
-.macro cfi_ignore a=0, b=0, c=0, d=0
-.endm
-
-#define CFI_STARTPROC		cfi_ignore
-#define CFI_ENDPROC		cfi_ignore
-#define CFI_DEF_CFA		cfi_ignore
-#define CFI_DEF_CFA_REGISTER	cfi_ignore
-#define CFI_DEF_CFA_OFFSET	cfi_ignore
-#define CFI_ADJUST_CFA_OFFSET	cfi_ignore
-#define CFI_OFFSET		cfi_ignore
-#define CFI_REL_OFFSET		cfi_ignore
-#define CFI_REGISTER		cfi_ignore
-#define CFI_RESTORE		cfi_ignore
-#define CFI_REMEMBER_STATE	cfi_ignore
-#define CFI_RESTORE_STATE	cfi_ignore
-#define CFI_UNDEFINED		cfi_ignore
-#define CFI_ESCAPE		cfi_ignore
-#define CFI_SIGNAL_FRAME	cfi_ignore
-
-#endif
-
-/*
- * An attempt to make CFI annotations more or less
- * correct and shorter. It is implied that you know
- * what you're doing if you use them.
- */
-#ifdef __ASSEMBLY__
-#ifdef CONFIG_X86_64
-	.macro pushq_cfi reg
-	pushq \reg
-	CFI_ADJUST_CFA_OFFSET 8
-	.endm
-
-	.macro pushq_cfi_reg reg
-	pushq %\reg
-	CFI_ADJUST_CFA_OFFSET 8
-	CFI_REL_OFFSET \reg, 0
-	.endm
-
-	.macro popq_cfi reg
-	popq \reg
-	CFI_ADJUST_CFA_OFFSET -8
-	.endm
-
-	.macro popq_cfi_reg reg
-	popq %\reg
-	CFI_ADJUST_CFA_OFFSET -8
-	CFI_RESTORE \reg
-	.endm
-
-	.macro pushfq_cfi
-	pushfq
-	CFI_ADJUST_CFA_OFFSET 8
-	.endm
-
-	.macro popfq_cfi
-	popfq
-	CFI_ADJUST_CFA_OFFSET -8
-	.endm
-
-	.macro movq_cfi reg offset=0
-	movq %\reg, \offset(%rsp)
-	CFI_REL_OFFSET \reg, \offset
-	.endm
-
-	.macro movq_cfi_restore offset reg
-	movq \offset(%rsp), %\reg
-	CFI_RESTORE \reg
-	.endm
-#else /*!CONFIG_X86_64*/
-	.macro pushl_cfi reg
-	pushl \reg
-	CFI_ADJUST_CFA_OFFSET 4
-	.endm
-
-	.macro pushl_cfi_reg reg
-	pushl %\reg
-	CFI_ADJUST_CFA_OFFSET 4
-	CFI_REL_OFFSET \reg, 0
-	.endm
-
-	.macro popl_cfi reg
-	popl \reg
-	CFI_ADJUST_CFA_OFFSET -4
-	.endm
-
-	.macro popl_cfi_reg reg
-	popl %\reg
-	CFI_ADJUST_CFA_OFFSET -4
-	CFI_RESTORE \reg
-	.endm
-
-	.macro pushfl_cfi
-	pushfl
-	CFI_ADJUST_CFA_OFFSET 4
-	.endm
-
-	.macro popfl_cfi
-	popfl
-	CFI_ADJUST_CFA_OFFSET -4
-	.endm
-
-	.macro movl_cfi reg offset=0
-	movl %\reg, \offset(%esp)
-	CFI_REL_OFFSET \reg, \offset
-	.endm
-
-	.macro movl_cfi_restore offset reg
-	movl \offset(%esp), %\reg
-	CFI_RESTORE \reg
-	.endm
-#endif /*!CONFIG_X86_64*/
-#endif /*__ASSEMBLY__*/
-
-#endif /* _ASM_X86_DWARF2_H */
diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h
index e9b57ecc70c5..cf8fdf83b231 100644
--- a/arch/x86/include/asm/edac.h
+++ b/arch/x86/include/asm/edac.h
@@ -3,7 +3,7 @@
 
 /* ECC atomic, DMA, SMP and interrupt safe scrub function */
 
-static inline void atomic_scrub(void *va, u32 size)
+static inline void edac_atomic_scrub(void *va, u32 size)
 {
 	u32 i, *virt_addr = va;
 
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 3738b138b843..155162ea0e00 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_X86_EFI_H
 #define _ASM_X86_EFI_H
 
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 #include <asm/pgtable.h>
 
 /*
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index dc5fa661465f..df002992d8fd 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -23,6 +23,8 @@ BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
 #ifdef CONFIG_HAVE_KVM
 BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR,
 		 smp_kvm_posted_intr_ipi)
+BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR,
+		 smp_kvm_posted_intr_wakeup_ipi)
 #endif
 
 /*
@@ -50,4 +52,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
 BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
 #endif
 
+#ifdef CONFIG_X86_MCE_AMD
+BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR)
+#endif
 #endif
diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
deleted file mode 100644
index da5e96756570..000000000000
--- a/arch/x86/include/asm/fpu-internal.h
+++ /dev/null
@@ -1,626 +0,0 @@
-/*
- * Copyright (C) 1994 Linus Torvalds
- *
- * Pentium III FXSR, SSE support
- * General FPU state handling cleanups
- *	Gareth Hughes <gareth@valinux.com>, May 2000
- * x86-64 work by Andi Kleen 2002
- */
-
-#ifndef _FPU_INTERNAL_H
-#define _FPU_INTERNAL_H
-
-#include <linux/kernel_stat.h>
-#include <linux/regset.h>
-#include <linux/compat.h>
-#include <linux/slab.h>
-#include <asm/asm.h>
-#include <asm/cpufeature.h>
-#include <asm/processor.h>
-#include <asm/sigcontext.h>
-#include <asm/user.h>
-#include <asm/uaccess.h>
-#include <asm/xsave.h>
-#include <asm/smap.h>
-
-#ifdef CONFIG_X86_64
-# include <asm/sigcontext32.h>
-# include <asm/user32.h>
-struct ksignal;
-int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
-			compat_sigset_t *set, struct pt_regs *regs);
-int ia32_setup_frame(int sig, struct ksignal *ksig,
-		     compat_sigset_t *set, struct pt_regs *regs);
-#else
-# define user_i387_ia32_struct	user_i387_struct
-# define user32_fxsr_struct	user_fxsr_struct
-# define ia32_setup_frame	__setup_frame
-# define ia32_setup_rt_frame	__setup_rt_frame
-#endif
-
-extern unsigned int mxcsr_feature_mask;
-extern void fpu_init(void);
-extern void eager_fpu_init(void);
-
-DECLARE_PER_CPU(struct task_struct *, fpu_owner_task);
-
-extern void convert_from_fxsr(struct user_i387_ia32_struct *env,
-			      struct task_struct *tsk);
-extern void convert_to_fxsr(struct task_struct *tsk,
-			    const struct user_i387_ia32_struct *env);
-
-extern user_regset_active_fn fpregs_active, xfpregs_active;
-extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
-				xstateregs_get;
-extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
-				 xstateregs_set;
-
-/*
- * xstateregs_active == fpregs_active. Please refer to the comment
- * at the definition of fpregs_active.
- */
-#define xstateregs_active	fpregs_active
-
-#ifdef CONFIG_MATH_EMULATION
-extern void finit_soft_fpu(struct i387_soft_struct *soft);
-#else
-static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
-#endif
-
-/*
- * Must be run with preemption disabled: this clears the fpu_owner_task,
- * on this CPU.
- *
- * This will disable any lazy FPU state restore of the current FPU state,
- * but if the current thread owns the FPU, it will still be saved by.
- */
-static inline void __cpu_disable_lazy_restore(unsigned int cpu)
-{
-	per_cpu(fpu_owner_task, cpu) = NULL;
-}
-
-/*
- * Used to indicate that the FPU state in memory is newer than the FPU
- * state in registers, and the FPU state should be reloaded next time the
- * task is run. Only safe on the current task, or non-running tasks.
- */
-static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk)
-{
-	tsk->thread.fpu.last_cpu = ~0;
-}
-
-static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
-{
-	return new == this_cpu_read_stable(fpu_owner_task) &&
-		cpu == new->thread.fpu.last_cpu;
-}
-
-static inline int is_ia32_compat_frame(void)
-{
-	return config_enabled(CONFIG_IA32_EMULATION) &&
-	       test_thread_flag(TIF_IA32);
-}
-
-static inline int is_ia32_frame(void)
-{
-	return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame();
-}
-
-static inline int is_x32_frame(void)
-{
-	return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32);
-}
-
-#define X87_FSW_ES (1 << 7)	/* Exception Summary */
-
-static __always_inline __pure bool use_eager_fpu(void)
-{
-	return static_cpu_has_safe(X86_FEATURE_EAGER_FPU);
-}
-
-static __always_inline __pure bool use_xsaveopt(void)
-{
-	return static_cpu_has_safe(X86_FEATURE_XSAVEOPT);
-}
-
-static __always_inline __pure bool use_xsave(void)
-{
-	return static_cpu_has_safe(X86_FEATURE_XSAVE);
-}
-
-static __always_inline __pure bool use_fxsr(void)
-{
-	return static_cpu_has_safe(X86_FEATURE_FXSR);
-}
-
-static inline void fx_finit(struct i387_fxsave_struct *fx)
-{
-	fx->cwd = 0x37f;
-	fx->mxcsr = MXCSR_DEFAULT;
-}
-
-extern void __sanitize_i387_state(struct task_struct *);
-
-static inline void sanitize_i387_state(struct task_struct *tsk)
-{
-	if (!use_xsaveopt())
-		return;
-	__sanitize_i387_state(tsk);
-}
-
-#define user_insn(insn, output, input...)				\
-({									\
-	int err;							\
-	asm volatile(ASM_STAC "\n"					\
-		     "1:" #insn "\n\t"					\
-		     "2: " ASM_CLAC "\n"				\
-		     ".section .fixup,\"ax\"\n"				\
-		     "3:  movl $-1,%[err]\n"				\
-		     "    jmp  2b\n"					\
-		     ".previous\n"					\
-		     _ASM_EXTABLE(1b, 3b)				\
-		     : [err] "=r" (err), output				\
-		     : "0"(0), input);					\
-	err;								\
-})
-
-#define check_insn(insn, output, input...)				\
-({									\
-	int err;							\
-	asm volatile("1:" #insn "\n\t"					\
-		     "2:\n"						\
-		     ".section .fixup,\"ax\"\n"				\
-		     "3:  movl $-1,%[err]\n"				\
-		     "    jmp  2b\n"					\
-		     ".previous\n"					\
-		     _ASM_EXTABLE(1b, 3b)				\
-		     : [err] "=r" (err), output				\
-		     : "0"(0), input);					\
-	err;								\
-})
-
-static inline int fsave_user(struct i387_fsave_struct __user *fx)
-{
-	return user_insn(fnsave %[fx]; fwait,  [fx] "=m" (*fx), "m" (*fx));
-}
-
-static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
-{
-	if (config_enabled(CONFIG_X86_32))
-		return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
-	else if (config_enabled(CONFIG_AS_FXSAVEQ))
-		return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
-
-	/* See comment in fpu_fxsave() below. */
-	return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx));
-}
-
-static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
-{
-	if (config_enabled(CONFIG_X86_32))
-		return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-	else if (config_enabled(CONFIG_AS_FXSAVEQ))
-		return check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
-
-	/* See comment in fpu_fxsave() below. */
-	return check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx),
-			  "m" (*fx));
-}
-
-static inline int fxrstor_user(struct i387_fxsave_struct __user *fx)
-{
-	if (config_enabled(CONFIG_X86_32))
-		return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-	else if (config_enabled(CONFIG_AS_FXSAVEQ))
-		return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
-
-	/* See comment in fpu_fxsave() below. */
-	return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx),
-			  "m" (*fx));
-}
-
-static inline int frstor_checking(struct i387_fsave_struct *fx)
-{
-	return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline int frstor_user(struct i387_fsave_struct __user *fx)
-{
-	return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-}
-
-static inline void fpu_fxsave(struct fpu *fpu)
-{
-	if (config_enabled(CONFIG_X86_32))
-		asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave));
-	else if (config_enabled(CONFIG_AS_FXSAVEQ))
-		asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave));
-	else {
-		/* Using "rex64; fxsave %0" is broken because, if the memory
-		 * operand uses any extended registers for addressing, a second
-		 * REX prefix will be generated (to the assembler, rex64
-		 * followed by semicolon is a separate instruction), and hence
-		 * the 64-bitness is lost.
-		 *
-		 * Using "fxsaveq %0" would be the ideal choice, but is only
-		 * supported starting with gas 2.16.
-		 *
-		 * Using, as a workaround, the properly prefixed form below
-		 * isn't accepted by any binutils version so far released,
-		 * complaining that the same type of prefix is used twice if
-		 * an extended register is needed for addressing (fix submitted
-		 * to mainline 2005-11-21).
-		 *
-		 *  asm volatile("rex64/fxsave %0" : "=m" (fpu->state->fxsave));
-		 *
-		 * This, however, we can work around by forcing the compiler to
-		 * select an addressing mode that doesn't require extended
-		 * registers.
-		 */
-		asm volatile( "rex64/fxsave (%[fx])"
-			     : "=m" (fpu->state->fxsave)
-			     : [fx] "R" (&fpu->state->fxsave));
-	}
-}
-
-/*
- * These must be called with preempt disabled. Returns
- * 'true' if the FPU state is still intact.
- */
-static inline int fpu_save_init(struct fpu *fpu)
-{
-	if (use_xsave()) {
-		fpu_xsave(fpu);
-
-		/*
-		 * xsave header may indicate the init state of the FP.
-		 */
-		if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
-			return 1;
-	} else if (use_fxsr()) {
-		fpu_fxsave(fpu);
-	} else {
-		asm volatile("fnsave %[fx]; fwait"
-			     : [fx] "=m" (fpu->state->fsave));
-		return 0;
-	}
-
-	/*
-	 * If exceptions are pending, we need to clear them so
-	 * that we don't randomly get exceptions later.
-	 *
-	 * FIXME! Is this perhaps only true for the old-style
-	 * irq13 case? Maybe we could leave the x87 state
-	 * intact otherwise?
-	 */
-	if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES)) {
-		asm volatile("fnclex");
-		return 0;
-	}
-	return 1;
-}
-
-static inline int __save_init_fpu(struct task_struct *tsk)
-{
-	return fpu_save_init(&tsk->thread.fpu);
-}
-
-static inline int fpu_restore_checking(struct fpu *fpu)
-{
-	if (use_xsave())
-		return fpu_xrstor_checking(&fpu->state->xsave);
-	else if (use_fxsr())
-		return fxrstor_checking(&fpu->state->fxsave);
-	else
-		return frstor_checking(&fpu->state->fsave);
-}
-
-static inline int restore_fpu_checking(struct task_struct *tsk)
-{
-	/*
-	 * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
-	 * pending. Clear the x87 state here by setting it to fixed values.
-	 * "m" is a random variable that should be in L1.
-	 */
-	if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) {
-		asm volatile(
-			"fnclex\n\t"
-			"emms\n\t"
-			"fildl %P[addr]"	/* set F?P to defined value */
-			: : [addr] "m" (tsk->thread.fpu.has_fpu));
-	}
-
-	return fpu_restore_checking(&tsk->thread.fpu);
-}
-
-/*
- * Software FPU state helpers. Careful: these need to
- * be preemption protection *and* they need to be
- * properly paired with the CR0.TS changes!
- */
-static inline int __thread_has_fpu(struct task_struct *tsk)
-{
-	return tsk->thread.fpu.has_fpu;
-}
-
-/* Must be paired with an 'stts' after! */
-static inline void __thread_clear_has_fpu(struct task_struct *tsk)
-{
-	tsk->thread.fpu.has_fpu = 0;
-	this_cpu_write(fpu_owner_task, NULL);
-}
-
-/* Must be paired with a 'clts' before! */
-static inline void __thread_set_has_fpu(struct task_struct *tsk)
-{
-	tsk->thread.fpu.has_fpu = 1;
-	this_cpu_write(fpu_owner_task, tsk);
-}
-
-/*
- * Encapsulate the CR0.TS handling together with the
- * software flag.
- *
- * These generally need preemption protection to work,
- * do try to avoid using these on their own.
- */
-static inline void __thread_fpu_end(struct task_struct *tsk)
-{
-	__thread_clear_has_fpu(tsk);
-	if (!use_eager_fpu())
-		stts();
-}
-
-static inline void __thread_fpu_begin(struct task_struct *tsk)
-{
-	if (!use_eager_fpu())
-		clts();
-	__thread_set_has_fpu(tsk);
-}
-
-static inline void drop_fpu(struct task_struct *tsk)
-{
-	/*
-	 * Forget coprocessor state..
-	 */
-	preempt_disable();
-	tsk->thread.fpu_counter = 0;
-
-	if (__thread_has_fpu(tsk)) {
-		/* Ignore delayed exceptions from user space */
-		asm volatile("1: fwait\n"
-			     "2:\n"
-			     _ASM_EXTABLE(1b, 2b));
-		__thread_fpu_end(tsk);
-	}
-
-	clear_stopped_child_used_math(tsk);
-	preempt_enable();
-}
-
-static inline void restore_init_xstate(void)
-{
-	if (use_xsave())
-		xrstor_state(init_xstate_buf, -1);
-	else
-		fxrstor_checking(&init_xstate_buf->i387);
-}
-
-/*
- * Reset the FPU state in the eager case and drop it in the lazy case (later use
- * will reinit it).
- */
-static inline void fpu_reset_state(struct task_struct *tsk)
-{
-	if (!use_eager_fpu())
-		drop_fpu(tsk);
-	else
-		restore_init_xstate();
-}
-
-/*
- * FPU state switching for scheduling.
- *
- * This is a two-stage process:
- *
- *  - switch_fpu_prepare() saves the old state and
- *    sets the new state of the CR0.TS bit. This is
- *    done within the context of the old process.
- *
- *  - switch_fpu_finish() restores the new state as
- *    necessary.
- */
-typedef struct { int preload; } fpu_switch_t;
-
-static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu)
-{
-	fpu_switch_t fpu;
-
-	/*
-	 * If the task has used the math, pre-load the FPU on xsave processors
-	 * or if the past 5 consecutive context-switches used math.
-	 */
-	fpu.preload = tsk_used_math(new) &&
-		      (use_eager_fpu() || new->thread.fpu_counter > 5);
-
-	if (__thread_has_fpu(old)) {
-		if (!__save_init_fpu(old))
-			task_disable_lazy_fpu_restore(old);
-		else
-			old->thread.fpu.last_cpu = cpu;
-
-		/* But leave fpu_owner_task! */
-		old->thread.fpu.has_fpu = 0;
-
-		/* Don't change CR0.TS if we just switch! */
-		if (fpu.preload) {
-			new->thread.fpu_counter++;
-			__thread_set_has_fpu(new);
-			prefetch(new->thread.fpu.state);
-		} else if (!use_eager_fpu())
-			stts();
-	} else {
-		old->thread.fpu_counter = 0;
-		task_disable_lazy_fpu_restore(old);
-		if (fpu.preload) {
-			new->thread.fpu_counter++;
-			if (fpu_lazy_restore(new, cpu))
-				fpu.preload = 0;
-			else
-				prefetch(new->thread.fpu.state);
-			__thread_fpu_begin(new);
-		}
-	}
-	return fpu;
-}
-
-/*
- * By the time this gets called, we've already cleared CR0.TS and
- * given the process the FPU if we are going to preload the FPU
- * state - all we need to do is to conditionally restore the register
- * state itself.
- */
-static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu)
-{
-	if (fpu.preload) {
-		if (unlikely(restore_fpu_checking(new)))
-			fpu_reset_state(new);
-	}
-}
-
-/*
- * Signal frame handlers...
- */
-extern int save_xstate_sig(void __user *buf, void __user *fx, int size);
-extern int __restore_xstate_sig(void __user *buf, void __user *fx, int size);
-
-static inline int xstate_sigframe_size(void)
-{
-	return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size;
-}
-
-static inline int restore_xstate_sig(void __user *buf, int ia32_frame)
-{
-	void __user *buf_fx = buf;
-	int size = xstate_sigframe_size();
-
-	if (ia32_frame && use_fxsr()) {
-		buf_fx = buf + sizeof(struct i387_fsave_struct);
-		size += sizeof(struct i387_fsave_struct);
-	}
-
-	return __restore_xstate_sig(buf, buf_fx, size);
-}
-
-/*
- * Needs to be preemption-safe.
- *
- * NOTE! user_fpu_begin() must be used only immediately before restoring
- * the save state. It does not do any saving/restoring on its own. In
- * lazy FPU mode, it is just an optimization to avoid a #NM exception,
- * the task can lose the FPU right after preempt_enable().
- */
-static inline void user_fpu_begin(void)
-{
-	preempt_disable();
-	if (!user_has_fpu())
-		__thread_fpu_begin(current);
-	preempt_enable();
-}
-
-static inline void __save_fpu(struct task_struct *tsk)
-{
-	if (use_xsave()) {
-		if (unlikely(system_state == SYSTEM_BOOTING))
-			xsave_state_booting(&tsk->thread.fpu.state->xsave, -1);
-		else
-			xsave_state(&tsk->thread.fpu.state->xsave, -1);
-	} else
-		fpu_fxsave(&tsk->thread.fpu);
-}
-
-/*
- * i387 state interaction
- */
-static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
-{
-	if (cpu_has_fxsr) {
-		return tsk->thread.fpu.state->fxsave.cwd;
-	} else {
-		return (unsigned short)tsk->thread.fpu.state->fsave.cwd;
-	}
-}
-
-static inline unsigned short get_fpu_swd(struct task_struct *tsk)
-{
-	if (cpu_has_fxsr) {
-		return tsk->thread.fpu.state->fxsave.swd;
-	} else {
-		return (unsigned short)tsk->thread.fpu.state->fsave.swd;
-	}
-}
-
-static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
-{
-	if (cpu_has_xmm) {
-		return tsk->thread.fpu.state->fxsave.mxcsr;
-	} else {
-		return MXCSR_DEFAULT;
-	}
-}
-
-static bool fpu_allocated(struct fpu *fpu)
-{
-	return fpu->state != NULL;
-}
-
-static inline int fpu_alloc(struct fpu *fpu)
-{
-	if (fpu_allocated(fpu))
-		return 0;
-	fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
-	if (!fpu->state)
-		return -ENOMEM;
-	WARN_ON((unsigned long)fpu->state & 15);
-	return 0;
-}
-
-static inline void fpu_free(struct fpu *fpu)
-{
-	if (fpu->state) {
-		kmem_cache_free(task_xstate_cachep, fpu->state);
-		fpu->state = NULL;
-	}
-}
-
-static inline void fpu_copy(struct task_struct *dst, struct task_struct *src)
-{
-	if (use_eager_fpu()) {
-		memset(&dst->thread.fpu.state->xsave, 0, xstate_size);
-		__save_fpu(dst);
-	} else {
-		struct fpu *dfpu = &dst->thread.fpu;
-		struct fpu *sfpu = &src->thread.fpu;
-
-		unlazy_fpu(src);
-		memcpy(dfpu->state, sfpu->state, xstate_size);
-	}
-}
-
-static inline unsigned long
-alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx,
-		unsigned long *size)
-{
-	unsigned long frame_size = xstate_sigframe_size();
-
-	*buf_fx = sp = round_down(sp - frame_size, 64);
-	if (ia32_frame && use_fxsr()) {
-		frame_size += sizeof(struct i387_fsave_struct);
-		sp -= sizeof(struct i387_fsave_struct);
-	}
-
-	*size = frame_size;
-	return sp;
-}
-
-#endif
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
new file mode 100644
index 000000000000..1429a7c736db
--- /dev/null
+++ b/arch/x86/include/asm/fpu/api.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ *	Gareth Hughes <gareth@valinux.com>, May 2000
+ * x86-64 work by Andi Kleen 2002
+ */
+
+#ifndef _ASM_X86_FPU_API_H
+#define _ASM_X86_FPU_API_H
+
+/*
+ * Careful: __kernel_fpu_begin/end() must be called with preempt disabled
+ * and they don't touch the preempt state on their own.
+ * If you enable preemption after __kernel_fpu_begin(), preempt notifier
+ * should call the __kernel_fpu_end() to prevent the kernel/user FPU
+ * state from getting corrupted. KVM for example uses this model.
+ *
+ * All other cases use kernel_fpu_begin/end() which disable preemption
+ * during kernel FPU usage.
+ */
+extern void __kernel_fpu_begin(void);
+extern void __kernel_fpu_end(void);
+extern void kernel_fpu_begin(void);
+extern void kernel_fpu_end(void);
+extern bool irq_fpu_usable(void);
+
+/*
+ * Some instructions like VIA's padlock instructions generate a spurious
+ * DNA fault but don't modify SSE registers. And these instructions
+ * get used from interrupt context as well. To prevent these kernel instructions
+ * in interrupt context interacting wrongly with other user/kernel fpu usage, we
+ * should use them only in the context of irq_ts_save/restore()
+ */
+extern int  irq_ts_save(void);
+extern void irq_ts_restore(int TS_state);
+
+/*
+ * Query the presence of one or more xfeatures. Works on any legacy CPU as well.
+ *
+ * If 'feature_name' is set then put a human-readable description of
+ * the feature there as well - this can be used to print error (or success)
+ * messages.
+ */
+extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);
+
+#endif /* _ASM_X86_FPU_API_H */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
new file mode 100644
index 000000000000..3c3550c3a4a3
--- /dev/null
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -0,0 +1,694 @@
+/*
+ * Copyright (C) 1994 Linus Torvalds
+ *
+ * Pentium III FXSR, SSE support
+ * General FPU state handling cleanups
+ *	Gareth Hughes <gareth@valinux.com>, May 2000
+ * x86-64 work by Andi Kleen 2002
+ */
+
+#ifndef _ASM_X86_FPU_INTERNAL_H
+#define _ASM_X86_FPU_INTERNAL_H
+
+#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <asm/user.h>
+#include <asm/fpu/api.h>
+#include <asm/fpu/xstate.h>
+
+/*
+ * High level FPU state handling functions:
+ */
+extern void fpu__activate_curr(struct fpu *fpu);
+extern void fpu__activate_fpstate_read(struct fpu *fpu);
+extern void fpu__activate_fpstate_write(struct fpu *fpu);
+extern void fpu__save(struct fpu *fpu);
+extern void fpu__restore(struct fpu *fpu);
+extern int  fpu__restore_sig(void __user *buf, int ia32_frame);
+extern void fpu__drop(struct fpu *fpu);
+extern int  fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu);
+extern void fpu__clear(struct fpu *fpu);
+extern int  fpu__exception_code(struct fpu *fpu, int trap_nr);
+extern int  dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate);
+
+/*
+ * Boot time FPU initialization functions:
+ */
+extern void fpu__init_cpu(void);
+extern void fpu__init_system_xstate(void);
+extern void fpu__init_cpu_xstate(void);
+extern void fpu__init_system(struct cpuinfo_x86 *c);
+extern void fpu__init_check_bugs(void);
+extern void fpu__resume_cpu(void);
+
+/*
+ * Debugging facility:
+ */
+#ifdef CONFIG_X86_DEBUG_FPU
+# define WARN_ON_FPU(x) WARN_ON_ONCE(x)
+#else
+# define WARN_ON_FPU(x) ({ (void)(x); 0; })
+#endif
+
+/*
+ * FPU related CPU feature flag helper routines:
+ */
+static __always_inline __pure bool use_eager_fpu(void)
+{
+	return static_cpu_has_safe(X86_FEATURE_EAGER_FPU);
+}
+
+static __always_inline __pure bool use_xsaveopt(void)
+{
+	return static_cpu_has_safe(X86_FEATURE_XSAVEOPT);
+}
+
+static __always_inline __pure bool use_xsave(void)
+{
+	return static_cpu_has_safe(X86_FEATURE_XSAVE);
+}
+
+static __always_inline __pure bool use_fxsr(void)
+{
+	return static_cpu_has_safe(X86_FEATURE_FXSR);
+}
+
+/*
+ * fpstate handling functions:
+ */
+
+extern union fpregs_state init_fpstate;
+
+extern void fpstate_init(union fpregs_state *state);
+#ifdef CONFIG_MATH_EMULATION
+extern void fpstate_init_soft(struct swregs_state *soft);
+#else
+static inline void fpstate_init_soft(struct swregs_state *soft) {}
+#endif
+static inline void fpstate_init_fxstate(struct fxregs_state *fx)
+{
+	fx->cwd = 0x37f;
+	fx->mxcsr = MXCSR_DEFAULT;
+}
+extern void fpstate_sanitize_xstate(struct fpu *fpu);
+
+#define user_insn(insn, output, input...)				\
+({									\
+	int err;							\
+	asm volatile(ASM_STAC "\n"					\
+		     "1:" #insn "\n\t"					\
+		     "2: " ASM_CLAC "\n"				\
+		     ".section .fixup,\"ax\"\n"				\
+		     "3:  movl $-1,%[err]\n"				\
+		     "    jmp  2b\n"					\
+		     ".previous\n"					\
+		     _ASM_EXTABLE(1b, 3b)				\
+		     : [err] "=r" (err), output				\
+		     : "0"(0), input);					\
+	err;								\
+})
+
+#define check_insn(insn, output, input...)				\
+({									\
+	int err;							\
+	asm volatile("1:" #insn "\n\t"					\
+		     "2:\n"						\
+		     ".section .fixup,\"ax\"\n"				\
+		     "3:  movl $-1,%[err]\n"				\
+		     "    jmp  2b\n"					\
+		     ".previous\n"					\
+		     _ASM_EXTABLE(1b, 3b)				\
+		     : [err] "=r" (err), output				\
+		     : "0"(0), input);					\
+	err;								\
+})
+
+static inline int copy_fregs_to_user(struct fregs_state __user *fx)
+{
+	return user_insn(fnsave %[fx]; fwait,  [fx] "=m" (*fx), "m" (*fx));
+}
+
+static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
+{
+	if (config_enabled(CONFIG_X86_32))
+		return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
+	else if (config_enabled(CONFIG_AS_FXSAVEQ))
+		return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
+
+	/* See comment in copy_fxregs_to_kernel() below. */
+	return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx));
+}
+
+static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
+{
+	int err;
+
+	if (config_enabled(CONFIG_X86_32)) {
+		err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+	} else {
+		if (config_enabled(CONFIG_AS_FXSAVEQ)) {
+			err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+		} else {
+			/* See comment in copy_fxregs_to_kernel() below. */
+			err = check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx));
+		}
+	}
+	/* Copying from a kernel buffer to FPU registers should never fail: */
+	WARN_ON_FPU(err);
+}
+
+static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
+{
+	if (config_enabled(CONFIG_X86_32))
+		return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+	else if (config_enabled(CONFIG_AS_FXSAVEQ))
+		return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+
+	/* See comment in copy_fxregs_to_kernel() below. */
+	return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx),
+			  "m" (*fx));
+}
+
+static inline void copy_kernel_to_fregs(struct fregs_state *fx)
+{
+	int err = check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+
+	WARN_ON_FPU(err);
+}
+
+static inline int copy_user_to_fregs(struct fregs_state __user *fx)
+{
+	return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+}
+
+static inline void copy_fxregs_to_kernel(struct fpu *fpu)
+{
+	if (config_enabled(CONFIG_X86_32))
+		asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
+	else if (config_enabled(CONFIG_AS_FXSAVEQ))
+		asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
+	else {
+		/* Using "rex64; fxsave %0" is broken because, if the memory
+		 * operand uses any extended registers for addressing, a second
+		 * REX prefix will be generated (to the assembler, rex64
+		 * followed by semicolon is a separate instruction), and hence
+		 * the 64-bitness is lost.
+		 *
+		 * Using "fxsaveq %0" would be the ideal choice, but is only
+		 * supported starting with gas 2.16.
+		 *
+		 * Using, as a workaround, the properly prefixed form below
+		 * isn't accepted by any binutils version so far released,
+		 * complaining that the same type of prefix is used twice if
+		 * an extended register is needed for addressing (fix submitted
+		 * to mainline 2005-11-21).
+		 *
+		 *  asm volatile("rex64/fxsave %0" : "=m" (fpu->state.fxsave));
+		 *
+		 * This, however, we can work around by forcing the compiler to
+		 * select an addressing mode that doesn't require extended
+		 * registers.
+		 */
+		asm volatile( "rex64/fxsave (%[fx])"
+			     : "=m" (fpu->state.fxsave)
+			     : [fx] "R" (&fpu->state.fxsave));
+	}
+}
+
+/* These macros all use (%edi)/(%rdi) as the single memory argument. */
+#define XSAVE		".byte " REX_PREFIX "0x0f,0xae,0x27"
+#define XSAVEOPT	".byte " REX_PREFIX "0x0f,0xae,0x37"
+#define XSAVES		".byte " REX_PREFIX "0x0f,0xc7,0x2f"
+#define XRSTOR		".byte " REX_PREFIX "0x0f,0xae,0x2f"
+#define XRSTORS		".byte " REX_PREFIX "0x0f,0xc7,0x1f"
+
+/* xstate instruction fault handler: */
+#define xstate_fault(__err)		\
+					\
+	".section .fixup,\"ax\"\n"	\
+					\
+	"3:  movl $-2,%[_err]\n"	\
+	"    jmp  2b\n"			\
+					\
+	".previous\n"			\
+					\
+	_ASM_EXTABLE(1b, 3b)		\
+	: [_err] "=r" (__err)
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternative can not be used yet.
+ */
+static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
+{
+	u64 mask = -1;
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+	int err = 0;
+
+	WARN_ON(system_state != SYSTEM_BOOTING);
+
+	if (boot_cpu_has(X86_FEATURE_XSAVES))
+		asm volatile("1:"XSAVES"\n\t"
+			"2:\n\t"
+			     xstate_fault(err)
+			: "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
+			: "memory");
+	else
+		asm volatile("1:"XSAVE"\n\t"
+			"2:\n\t"
+			     xstate_fault(err)
+			: "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
+			: "memory");
+
+	/* We should never fault when copying to a kernel buffer: */
+	WARN_ON_FPU(err);
+}
+
+/*
+ * This function is called only during boot time when x86 caps are not set
+ * up and alternative can not be used yet.
+ */
+static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
+{
+	u64 mask = -1;
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+	int err = 0;
+
+	WARN_ON(system_state != SYSTEM_BOOTING);
+
+	if (boot_cpu_has(X86_FEATURE_XSAVES))
+		asm volatile("1:"XRSTORS"\n\t"
+			"2:\n\t"
+			     xstate_fault(err)
+			: "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
+			: "memory");
+	else
+		asm volatile("1:"XRSTOR"\n\t"
+			"2:\n\t"
+			     xstate_fault(err)
+			: "D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask), "0" (err)
+			: "memory");
+
+	/* We should never fault when copying from a kernel buffer: */
+	WARN_ON_FPU(err);
+}
+
+/*
+ * Save processor xstate to xsave area.
+ */
+static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
+{
+	u64 mask = -1;
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+	int err = 0;
+
+	WARN_ON(!alternatives_patched);
+
+	/*
+	 * If xsaves is enabled, xsaves replaces xsaveopt because
+	 * it supports compact format and supervisor states in addition to
+	 * modified optimization in xsaveopt.
+	 *
+	 * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave
+	 * because xsaveopt supports modified optimization which is not
+	 * supported by xsave.
+	 *
+	 * If none of xsaves and xsaveopt is enabled, use xsave.
+	 */
+	alternative_input_2(
+		"1:"XSAVE,
+		XSAVEOPT,
+		X86_FEATURE_XSAVEOPT,
+		XSAVES,
+		X86_FEATURE_XSAVES,
+		[xstate] "D" (xstate), "a" (lmask), "d" (hmask) :
+		"memory");
+	asm volatile("2:\n\t"
+		     xstate_fault(err)
+		     : "0" (err)
+		     : "memory");
+
+	/* We should never fault when copying to a kernel buffer: */
+	WARN_ON_FPU(err);
+}
+
+/*
+ * Restore processor xstate from xsave area.
+ */
+static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
+{
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+	int err = 0;
+
+	/*
+	 * Use xrstors to restore context if it is enabled. xrstors supports
+	 * compacted format of xsave area which is not supported by xrstor.
+	 */
+	alternative_input(
+		"1: " XRSTOR,
+		XRSTORS,
+		X86_FEATURE_XSAVES,
+		"D" (xstate), "m" (*xstate), "a" (lmask), "d" (hmask)
+		: "memory");
+
+	asm volatile("2:\n"
+		     xstate_fault(err)
+		     : "0" (err)
+		     : "memory");
+
+	/* We should never fault when copying from a kernel buffer: */
+	WARN_ON_FPU(err);
+}
+
+/*
+ * Save xstate to user space xsave area.
+ *
+ * We don't use modified optimization because xrstor/xrstors might track
+ * a different application.
+ *
+ * We don't use compacted format xsave area for
+ * backward compatibility for old applications which don't understand
+ * compacted format of xsave area.
+ */
+static inline int copy_xregs_to_user(struct xregs_state __user *buf)
+{
+	int err;
+
+	/*
+	 * Clear the xsave header first, so that reserved fields are
+	 * initialized to zero.
+	 */
+	err = __clear_user(&buf->header, sizeof(buf->header));
+	if (unlikely(err))
+		return -EFAULT;
+
+	__asm__ __volatile__(ASM_STAC "\n"
+			     "1:"XSAVE"\n"
+			     "2: " ASM_CLAC "\n"
+			     xstate_fault(err)
+			     : "D" (buf), "a" (-1), "d" (-1), "0" (err)
+			     : "memory");
+	return err;
+}
+
+/*
+ * Restore xstate from user space xsave area.
+ */
+static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
+{
+	struct xregs_state *xstate = ((__force struct xregs_state *)buf);
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
+	int err = 0;
+
+	__asm__ __volatile__(ASM_STAC "\n"
+			     "1:"XRSTOR"\n"
+			     "2: " ASM_CLAC "\n"
+			     xstate_fault(err)
+			     : "D" (xstate), "a" (lmask), "d" (hmask), "0" (err)
+			     : "memory");	/* memory required? */
+	return err;
+}
+
+/*
+ * These must be called with preempt disabled. Returns
+ * 'true' if the FPU state is still intact and we can
+ * keep registers active.
+ *
+ * The legacy FNSAVE instruction cleared all FPU state
+ * unconditionally, so registers are essentially destroyed.
+ * Modern FPU state can be kept in registers, if there are
+ * no pending FP exceptions.
+ */
+static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
+{
+	if (likely(use_xsave())) {
+		copy_xregs_to_kernel(&fpu->state.xsave);
+		return 1;
+	}
+
+	if (likely(use_fxsr())) {
+		copy_fxregs_to_kernel(fpu);
+		return 1;
+	}
+
+	/*
+	 * Legacy FPU register saving, FNSAVE always clears FPU registers,
+	 * so we have to mark them inactive:
+	 */
+	asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
+
+	return 0;
+}
+
+static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate)
+{
+	if (use_xsave()) {
+		copy_kernel_to_xregs(&fpstate->xsave, -1);
+	} else {
+		if (use_fxsr())
+			copy_kernel_to_fxregs(&fpstate->fxsave);
+		else
+			copy_kernel_to_fregs(&fpstate->fsave);
+	}
+}
+
+static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate)
+{
+	/*
+	 * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
+	 * pending. Clear the x87 state here by setting it to fixed values.
+	 * "m" is a random variable that should be in L1.
+	 */
+	if (unlikely(static_cpu_has_bug_safe(X86_BUG_FXSAVE_LEAK))) {
+		asm volatile(
+			"fnclex\n\t"
+			"emms\n\t"
+			"fildl %P[addr]"	/* set F?P to defined value */
+			: : [addr] "m" (fpstate));
+	}
+
+	__copy_kernel_to_fpregs(fpstate);
+}
+
+extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
+
+/*
+ * FPU context switch related helper methods:
+ */
+
+DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
+
+/*
+ * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx,
+ * on this CPU.
+ *
+ * This will disable any lazy FPU state restore of the current FPU state,
+ * but if the current thread owns the FPU, it will still be saved by.
+ */
+static inline void __cpu_disable_lazy_restore(unsigned int cpu)
+{
+	per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
+}
+
+static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu)
+{
+	return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
+}
+
+
+/*
+ * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation'
+ * idiom, which is then paired with the sw-flag (fpregs_active) later on:
+ */
+
+static inline void __fpregs_activate_hw(void)
+{
+	if (!use_eager_fpu())
+		clts();
+}
+
+static inline void __fpregs_deactivate_hw(void)
+{
+	if (!use_eager_fpu())
+		stts();
+}
+
+/* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */
+static inline void __fpregs_deactivate(struct fpu *fpu)
+{
+	WARN_ON_FPU(!fpu->fpregs_active);
+
+	fpu->fpregs_active = 0;
+	this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+}
+
+/* Must be paired with a 'clts' (fpregs_activate_hw()) before! */
+static inline void __fpregs_activate(struct fpu *fpu)
+{
+	WARN_ON_FPU(fpu->fpregs_active);
+
+	fpu->fpregs_active = 1;
+	this_cpu_write(fpu_fpregs_owner_ctx, fpu);
+}
+
+/*
+ * The question "does this thread have fpu access?"
+ * is slightly racy, since preemption could come in
+ * and revoke it immediately after the test.
+ *
+ * However, even in that very unlikely scenario,
+ * we can just assume we have FPU access - typically
+ * to save the FP state - we'll just take a #NM
+ * fault and get the FPU access back.
+ */
+static inline int fpregs_active(void)
+{
+	return current->thread.fpu.fpregs_active;
+}
+
+/*
+ * Encapsulate the CR0.TS handling together with the
+ * software flag.
+ *
+ * These generally need preemption protection to work,
+ * do try to avoid using these on their own.
+ */
+static inline void fpregs_activate(struct fpu *fpu)
+{
+	__fpregs_activate_hw();
+	__fpregs_activate(fpu);
+}
+
+static inline void fpregs_deactivate(struct fpu *fpu)
+{
+	__fpregs_deactivate(fpu);
+	__fpregs_deactivate_hw();
+}
+
+/*
+ * FPU state switching for scheduling.
+ *
+ * This is a two-stage process:
+ *
+ *  - switch_fpu_prepare() saves the old state and
+ *    sets the new state of the CR0.TS bit. This is
+ *    done within the context of the old process.
+ *
+ *  - switch_fpu_finish() restores the new state as
+ *    necessary.
+ */
+typedef struct { int preload; } fpu_switch_t;
+
+static inline fpu_switch_t
+switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu)
+{
+	fpu_switch_t fpu;
+
+	/*
+	 * If the task has used the math, pre-load the FPU on xsave processors
+	 * or if the past 5 consecutive context-switches used math.
+	 */
+	fpu.preload = new_fpu->fpstate_active &&
+		      (use_eager_fpu() || new_fpu->counter > 5);
+
+	if (old_fpu->fpregs_active) {
+		if (!copy_fpregs_to_fpstate(old_fpu))
+			old_fpu->last_cpu = -1;
+		else
+			old_fpu->last_cpu = cpu;
+
+		/* But leave fpu_fpregs_owner_ctx! */
+		old_fpu->fpregs_active = 0;
+
+		/* Don't change CR0.TS if we just switch! */
+		if (fpu.preload) {
+			new_fpu->counter++;
+			__fpregs_activate(new_fpu);
+			prefetch(&new_fpu->state);
+		} else {
+			__fpregs_deactivate_hw();
+		}
+	} else {
+		old_fpu->counter = 0;
+		old_fpu->last_cpu = -1;
+		if (fpu.preload) {
+			new_fpu->counter++;
+			if (fpu_want_lazy_restore(new_fpu, cpu))
+				fpu.preload = 0;
+			else
+				prefetch(&new_fpu->state);
+			fpregs_activate(new_fpu);
+		}
+	}
+	return fpu;
+}
+
+/*
+ * Misc helper functions:
+ */
+
+/*
+ * By the time this gets called, we've already cleared CR0.TS and
+ * given the process the FPU if we are going to preload the FPU
+ * state - all we need to do is to conditionally restore the register
+ * state itself.
+ */
+static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch)
+{
+	if (fpu_switch.preload)
+		copy_kernel_to_fpregs(&new_fpu->state);
+}
+
+/*
+ * Needs to be preemption-safe.
+ *
+ * NOTE! user_fpu_begin() must be used only immediately before restoring
+ * the save state. It does not do any saving/restoring on its own. In
+ * lazy FPU mode, it is just an optimization to avoid a #NM exception,
+ * the task can lose the FPU right after preempt_enable().
+ */
+static inline void user_fpu_begin(void)
+{
+	struct fpu *fpu = &current->thread.fpu;
+
+	preempt_disable();
+	if (!fpregs_active())
+		fpregs_activate(fpu);
+	preempt_enable();
+}
+
+/*
+ * MXCSR and XCR definitions:
+ */
+
+extern unsigned int mxcsr_feature_mask;
+
+#define XCR_XFEATURE_ENABLED_MASK	0x00000000
+
+static inline u64 xgetbv(u32 index)
+{
+	u32 eax, edx;
+
+	asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
+		     : "=a" (eax), "=d" (edx)
+		     : "c" (index));
+	return eax + ((u64)edx << 32);
+}
+
+static inline void xsetbv(u32 index, u64 value)
+{
+	u32 eax = value;
+	u32 edx = value >> 32;
+
+	asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
+		     : : "a" (eax), "d" (edx), "c" (index));
+}
+
+#endif /* _ASM_X86_FPU_INTERNAL_H */
diff --git a/arch/x86/include/asm/fpu/regset.h b/arch/x86/include/asm/fpu/regset.h
new file mode 100644
index 000000000000..39d3107ac6c7
--- /dev/null
+++ b/arch/x86/include/asm/fpu/regset.h
@@ -0,0 +1,21 @@
+/*
+ * FPU regset handling methods:
+ */
+#ifndef _ASM_X86_FPU_REGSET_H
+#define _ASM_X86_FPU_REGSET_H
+
+#include <linux/regset.h>
+
+extern user_regset_active_fn regset_fpregs_active, regset_xregset_fpregs_active;
+extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
+				xstateregs_get;
+extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
+				 xstateregs_set;
+
+/*
+ * xstateregs_active == regset_fpregs_active. Please refer to the comment
+ * at the definition of regset_fpregs_active.
+ */
+#define xstateregs_active	regset_fpregs_active
+
+#endif /* _ASM_X86_FPU_REGSET_H */
diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h
new file mode 100644
index 000000000000..7358e9d61f1e
--- /dev/null
+++ b/arch/x86/include/asm/fpu/signal.h
@@ -0,0 +1,33 @@
+/*
+ * x86 FPU signal frame handling methods:
+ */
+#ifndef _ASM_X86_FPU_SIGNAL_H
+#define _ASM_X86_FPU_SIGNAL_H
+
+#ifdef CONFIG_X86_64
+# include <asm/sigcontext32.h>
+# include <asm/user32.h>
+struct ksignal;
+int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
+			compat_sigset_t *set, struct pt_regs *regs);
+int ia32_setup_frame(int sig, struct ksignal *ksig,
+		     compat_sigset_t *set, struct pt_regs *regs);
+#else
+# define user_i387_ia32_struct	user_i387_struct
+# define user32_fxsr_struct	user_fxsr_struct
+# define ia32_setup_frame	__setup_frame
+# define ia32_setup_rt_frame	__setup_rt_frame
+#endif
+
+extern void convert_from_fxsr(struct user_i387_ia32_struct *env,
+			      struct task_struct *tsk);
+extern void convert_to_fxsr(struct task_struct *tsk,
+			    const struct user_i387_ia32_struct *env);
+
+unsigned long
+fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
+		     unsigned long *buf_fx, unsigned long *size);
+
+extern void fpu__init_prepare_fx_sw_frame(void);
+
+#endif /* _ASM_X86_FPU_SIGNAL_H */
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
new file mode 100644
index 000000000000..0637826292de
--- /dev/null
+++ b/arch/x86/include/asm/fpu/types.h
@@ -0,0 +1,293 @@
+/*
+ * FPU data structures:
+ */
+#ifndef _ASM_X86_FPU_H
+#define _ASM_X86_FPU_H
+
+/*
+ * The legacy x87 FPU state format, as saved by FSAVE and
+ * restored by the FRSTOR instructions:
+ */
+struct fregs_state {
+	u32			cwd;	/* FPU Control Word		*/
+	u32			swd;	/* FPU Status Word		*/
+	u32			twd;	/* FPU Tag Word			*/
+	u32			fip;	/* FPU IP Offset		*/
+	u32			fcs;	/* FPU IP Selector		*/
+	u32			foo;	/* FPU Operand Pointer Offset	*/
+	u32			fos;	/* FPU Operand Pointer Selector	*/
+
+	/* 8*10 bytes for each FP-reg = 80 bytes:			*/
+	u32			st_space[20];
+
+	/* Software status information [not touched by FSAVE]:		*/
+	u32			status;
+};
+
+/*
+ * The legacy fx SSE/MMX FPU state format, as saved by FXSAVE and
+ * restored by the FXRSTOR instructions. It's similar to the FSAVE
+ * format, but differs in some areas, plus has extensions at
+ * the end for the XMM registers.
+ */
+struct fxregs_state {
+	u16			cwd; /* Control Word			*/
+	u16			swd; /* Status Word			*/
+	u16			twd; /* Tag Word			*/
+	u16			fop; /* Last Instruction Opcode		*/
+	union {
+		struct {
+			u64	rip; /* Instruction Pointer		*/
+			u64	rdp; /* Data Pointer			*/
+		};
+		struct {
+			u32	fip; /* FPU IP Offset			*/
+			u32	fcs; /* FPU IP Selector			*/
+			u32	foo; /* FPU Operand Offset		*/
+			u32	fos; /* FPU Operand Selector		*/
+		};
+	};
+	u32			mxcsr;		/* MXCSR Register State */
+	u32			mxcsr_mask;	/* MXCSR Mask		*/
+
+	/* 8*16 bytes for each FP-reg = 128 bytes:			*/
+	u32			st_space[32];
+
+	/* 16*16 bytes for each XMM-reg = 256 bytes:			*/
+	u32			xmm_space[64];
+
+	u32			padding[12];
+
+	union {
+		u32		padding1[12];
+		u32		sw_reserved[12];
+	};
+
+} __attribute__((aligned(16)));
+
+/* Default value for fxregs_state.mxcsr: */
+#define MXCSR_DEFAULT		0x1f80
+
+/*
+ * Software based FPU emulation state. This is arbitrary really,
+ * it matches the x87 format to make it easier to understand:
+ */
+struct swregs_state {
+	u32			cwd;
+	u32			swd;
+	u32			twd;
+	u32			fip;
+	u32			fcs;
+	u32			foo;
+	u32			fos;
+	/* 8*10 bytes for each FP-reg = 80 bytes: */
+	u32			st_space[20];
+	u8			ftop;
+	u8			changed;
+	u8			lookahead;
+	u8			no_update;
+	u8			rm;
+	u8			alimit;
+	struct math_emu_info	*info;
+	u32			entry_eip;
+};
+
+/*
+ * List of XSAVE features Linux knows about:
+ */
+enum xfeature_bit {
+	XSTATE_BIT_FP,
+	XSTATE_BIT_SSE,
+	XSTATE_BIT_YMM,
+	XSTATE_BIT_BNDREGS,
+	XSTATE_BIT_BNDCSR,
+	XSTATE_BIT_OPMASK,
+	XSTATE_BIT_ZMM_Hi256,
+	XSTATE_BIT_Hi16_ZMM,
+
+	XFEATURES_NR_MAX,
+};
+
+#define XSTATE_FP		(1 << XSTATE_BIT_FP)
+#define XSTATE_SSE		(1 << XSTATE_BIT_SSE)
+#define XSTATE_YMM		(1 << XSTATE_BIT_YMM)
+#define XSTATE_BNDREGS		(1 << XSTATE_BIT_BNDREGS)
+#define XSTATE_BNDCSR		(1 << XSTATE_BIT_BNDCSR)
+#define XSTATE_OPMASK		(1 << XSTATE_BIT_OPMASK)
+#define XSTATE_ZMM_Hi256	(1 << XSTATE_BIT_ZMM_Hi256)
+#define XSTATE_Hi16_ZMM		(1 << XSTATE_BIT_Hi16_ZMM)
+
+#define XSTATE_FPSSE		(XSTATE_FP | XSTATE_SSE)
+#define XSTATE_AVX512		(XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
+
+/*
+ * There are 16x 256-bit AVX registers named YMM0-YMM15.
+ * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15)
+ * and are stored in 'struct fxregs_state::xmm_space[]'.
+ *
+ * The high 128 bits are stored here:
+ *    16x 128 bits == 256 bytes.
+ */
+struct ymmh_struct {
+	u8				ymmh_space[256];
+};
+
+/* We don't support LWP yet: */
+struct lwp_struct {
+	u8				reserved[128];
+};
+
+/* Intel MPX support: */
+struct bndreg {
+	u64				lower_bound;
+	u64				upper_bound;
+} __packed;
+
+struct bndcsr {
+	u64				bndcfgu;
+	u64				bndstatus;
+} __packed;
+
+struct mpx_struct {
+	struct bndreg			bndreg[4];
+	struct bndcsr			bndcsr;
+};
+
+struct xstate_header {
+	u64				xfeatures;
+	u64				xcomp_bv;
+	u64				reserved[6];
+} __attribute__((packed));
+
+/* New processor state extensions should be added here: */
+#define XSTATE_RESERVE			(sizeof(struct ymmh_struct) + \
+					 sizeof(struct lwp_struct)  + \
+					 sizeof(struct mpx_struct)  )
+/*
+ * This is our most modern FPU state format, as saved by the XSAVE
+ * and restored by the XRSTOR instructions.
+ *
+ * It consists of a legacy fxregs portion, an xstate header and
+ * subsequent fixed size areas as defined by the xstate header.
+ * Not all CPUs support all the extensions.
+ */
+struct xregs_state {
+	struct fxregs_state		i387;
+	struct xstate_header		header;
+	u8				__reserved[XSTATE_RESERVE];
+} __attribute__ ((packed, aligned (64)));
+
+/*
+ * This is a union of all the possible FPU state formats
+ * put together, so that we can pick the right one runtime.
+ *
+ * The size of the structure is determined by the largest
+ * member - which is the xsave area:
+ */
+union fpregs_state {
+	struct fregs_state		fsave;
+	struct fxregs_state		fxsave;
+	struct swregs_state		soft;
+	struct xregs_state		xsave;
+};
+
+/*
+ * Highest level per task FPU state data structure that
+ * contains the FPU register state plus various FPU
+ * state fields:
+ */
+struct fpu {
+	/*
+	 * @state:
+	 *
+	 * In-memory copy of all FPU registers that we save/restore
+	 * over context switches. If the task is using the FPU then
+	 * the registers in the FPU are more recent than this state
+	 * copy. If the task context-switches away then they get
+	 * saved here and represent the FPU state.
+	 *
+	 * After context switches there may be a (short) time period
+	 * during which the in-FPU hardware registers are unchanged
+	 * and still perfectly match this state, if the tasks
+	 * scheduled afterwards are not using the FPU.
+	 *
+	 * This is the 'lazy restore' window of optimization, which
+	 * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
+	 *
+	 * We detect whether a subsequent task uses the FPU via setting
+	 * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
+	 *
+	 * During this window, if the task gets scheduled again, we
+	 * might be able to skip having to do a restore from this
+	 * memory buffer to the hardware registers - at the cost of
+	 * incurring the overhead of #NM fault traps.
+	 *
+	 * Note that on modern CPUs that support the XSAVEOPT (or other
+	 * optimized XSAVE instructions), we don't use #NM traps anymore,
+	 * as the hardware can track whether FPU registers need saving
+	 * or not. On such CPUs we activate the non-lazy ('eagerfpu')
+	 * logic, which unconditionally saves/restores all FPU state
+	 * across context switches. (if FPU state exists.)
+	 */
+	union fpregs_state		state;
+
+	/*
+	 * @last_cpu:
+	 *
+	 * Records the last CPU on which this context was loaded into
+	 * FPU registers. (In the lazy-restore case we might be
+	 * able to reuse FPU registers across multiple context switches
+	 * this way, if no intermediate task used the FPU.)
+	 *
+	 * A value of -1 is used to indicate that the FPU state in context
+	 * memory is newer than the FPU state in registers, and that the
+	 * FPU state should be reloaded next time the task is run.
+	 */
+	unsigned int			last_cpu;
+
+	/*
+	 * @fpstate_active:
+	 *
+	 * This flag indicates whether this context is active: if the task
+	 * is not running then we can restore from this context, if the task
+	 * is running then we should save into this context.
+	 */
+	unsigned char			fpstate_active;
+
+	/*
+	 * @fpregs_active:
+	 *
+	 * This flag determines whether a given context is actively
+	 * loaded into the FPU's registers and that those registers
+	 * represent the task's current FPU state.
+	 *
+	 * Note the interaction with fpstate_active:
+	 *
+	 *   # task does not use the FPU:
+	 *   fpstate_active == 0
+	 *
+	 *   # task uses the FPU and regs are active:
+	 *   fpstate_active == 1 && fpregs_active == 1
+	 *
+	 *   # the regs are inactive but still match fpstate:
+	 *   fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu
+	 *
+	 * The third state is what we use for the lazy restore optimization
+	 * on lazy-switching CPUs.
+	 */
+	unsigned char			fpregs_active;
+
+	/*
+	 * @counter:
+	 *
+	 * This counter contains the number of consecutive context switches
+	 * during which the FPU stays used. If this is over a threshold, the
+	 * lazy FPU restore logic becomes eager, to save the trap overhead.
+	 * This is an unsigned char so that after 256 iterations the counter
+	 * wraps and the context switch behavior turns lazy again; this is to
+	 * deal with bursty apps that only use the FPU for a short time:
+	 */
+	unsigned char			counter;
+};
+
+#endif /* _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
new file mode 100644
index 000000000000..4656b25bb9a7
--- /dev/null
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -0,0 +1,46 @@
+#ifndef __ASM_X86_XSAVE_H
+#define __ASM_X86_XSAVE_H
+
+#include <linux/types.h>
+#include <asm/processor.h>
+#include <linux/uaccess.h>
+
+/* Bit 63 of XCR0 is reserved for future expansion */
+#define XSTATE_EXTEND_MASK	(~(XSTATE_FPSSE | (1ULL << 63)))
+
+#define XSTATE_CPUID		0x0000000d
+
+#define FXSAVE_SIZE	512
+
+#define XSAVE_HDR_SIZE	    64
+#define XSAVE_HDR_OFFSET    FXSAVE_SIZE
+
+#define XSAVE_YMM_SIZE	    256
+#define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
+
+/* Supported features which support lazy state saving */
+#define XSTATE_LAZY	(XSTATE_FP | XSTATE_SSE | XSTATE_YMM		      \
+			| XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
+
+/* Supported features which require eager state saving */
+#define XSTATE_EAGER	(XSTATE_BNDREGS | XSTATE_BNDCSR)
+
+/* All currently supported features */
+#define XCNTXT_MASK	(XSTATE_LAZY | XSTATE_EAGER)
+
+#ifdef CONFIG_X86_64
+#define REX_PREFIX	"0x48, "
+#else
+#define REX_PREFIX
+#endif
+
+extern unsigned int xstate_size;
+extern u64 xfeatures_mask;
+extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
+
+extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
+
+void *get_xsave_addr(struct xregs_state *xsave, int xstate);
+const void *get_xsave_field_ptr(int xstate_field);
+
+#endif
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 3b629f47eb65..793179cf8e21 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -1,20 +1,17 @@
 #ifdef __ASSEMBLY__
 
 #include <asm/asm.h>
-#include <asm/dwarf2.h>
 
 /* The annotation hides the frame from the unwinder and makes it look
    like a ordinary ebp save/restore. This avoids some special cases for
    frame pointer later */
 #ifdef CONFIG_FRAME_POINTER
 	.macro FRAME
-	__ASM_SIZE(push,_cfi)	%__ASM_REG(bp)
-	CFI_REL_OFFSET		__ASM_REG(bp), 0
+	__ASM_SIZE(push,)	%__ASM_REG(bp)
 	__ASM_SIZE(mov)		%__ASM_REG(sp), %__ASM_REG(bp)
 	.endm
 	.macro ENDFRAME
-	__ASM_SIZE(pop,_cfi)	%__ASM_REG(bp)
-	CFI_RESTORE		__ASM_REG(bp)
+	__ASM_SIZE(pop,)	%__ASM_REG(bp)
 	.endm
 #else
 	.macro FRAME
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 0f5fb6b6567e..7178043b0e1d 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -14,6 +14,7 @@ typedef struct {
 #endif
 #ifdef CONFIG_HAVE_KVM
 	unsigned int kvm_posted_intr_ipis;
+	unsigned int kvm_posted_intr_wakeup_ipis;
 #endif
 	unsigned int x86_platform_ipis;	/* arch dependent */
 	unsigned int apic_perf_irqs;
@@ -33,6 +34,9 @@ typedef struct {
 #ifdef CONFIG_X86_MCE_THRESHOLD
 	unsigned int irq_threshold_count;
 #endif
+#ifdef CONFIG_X86_MCE_AMD
+	unsigned int irq_deferred_error_count;
+#endif
 #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
 	unsigned int irq_hv_callback_count;
 #endif
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 36f7125945e3..5fa9fb0f8809 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -74,20 +74,16 @@ extern unsigned int hpet_readl(unsigned int a);
 extern void force_hpet_resume(void);
 
 struct irq_data;
+struct hpet_dev;
+struct irq_domain;
+
 extern void hpet_msi_unmask(struct irq_data *data);
 extern void hpet_msi_mask(struct irq_data *data);
-struct hpet_dev;
 extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg);
 extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg);
-
-#ifdef CONFIG_PCI_MSI
-extern int default_setup_hpet_msi(unsigned int irq, unsigned int id);
-#else
-static inline int default_setup_hpet_msi(unsigned int irq, unsigned int id)
-{
-	return -EINVAL;
-}
-#endif
+extern struct irq_domain *hpet_create_irq_domain(int hpet_id);
+extern int hpet_assign_irq(struct irq_domain *domain,
+			   struct hpet_dev *dev, int dev_num);
 
 #ifdef CONFIG_HPET_EMULATE_RTC
 
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index e9571ddabc4f..6615032e19c8 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,6 +29,7 @@
 extern asmlinkage void apic_timer_interrupt(void);
 extern asmlinkage void x86_platform_ipi(void);
 extern asmlinkage void kvm_posted_intr_ipi(void);
+extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
 extern asmlinkage void error_interrupt(void);
 extern asmlinkage void irq_work_interrupt(void);
 
@@ -36,43 +37,10 @@ extern asmlinkage void spurious_interrupt(void);
 extern asmlinkage void thermal_interrupt(void);
 extern asmlinkage void reschedule_interrupt(void);
 
-extern asmlinkage void invalidate_interrupt(void);
-extern asmlinkage void invalidate_interrupt0(void);
-extern asmlinkage void invalidate_interrupt1(void);
-extern asmlinkage void invalidate_interrupt2(void);
-extern asmlinkage void invalidate_interrupt3(void);
-extern asmlinkage void invalidate_interrupt4(void);
-extern asmlinkage void invalidate_interrupt5(void);
-extern asmlinkage void invalidate_interrupt6(void);
-extern asmlinkage void invalidate_interrupt7(void);
-extern asmlinkage void invalidate_interrupt8(void);
-extern asmlinkage void invalidate_interrupt9(void);
-extern asmlinkage void invalidate_interrupt10(void);
-extern asmlinkage void invalidate_interrupt11(void);
-extern asmlinkage void invalidate_interrupt12(void);
-extern asmlinkage void invalidate_interrupt13(void);
-extern asmlinkage void invalidate_interrupt14(void);
-extern asmlinkage void invalidate_interrupt15(void);
-extern asmlinkage void invalidate_interrupt16(void);
-extern asmlinkage void invalidate_interrupt17(void);
-extern asmlinkage void invalidate_interrupt18(void);
-extern asmlinkage void invalidate_interrupt19(void);
-extern asmlinkage void invalidate_interrupt20(void);
-extern asmlinkage void invalidate_interrupt21(void);
-extern asmlinkage void invalidate_interrupt22(void);
-extern asmlinkage void invalidate_interrupt23(void);
-extern asmlinkage void invalidate_interrupt24(void);
-extern asmlinkage void invalidate_interrupt25(void);
-extern asmlinkage void invalidate_interrupt26(void);
-extern asmlinkage void invalidate_interrupt27(void);
-extern asmlinkage void invalidate_interrupt28(void);
-extern asmlinkage void invalidate_interrupt29(void);
-extern asmlinkage void invalidate_interrupt30(void);
-extern asmlinkage void invalidate_interrupt31(void);
-
 extern asmlinkage void irq_move_cleanup_interrupt(void);
 extern asmlinkage void reboot_interrupt(void);
 extern asmlinkage void threshold_interrupt(void);
+extern asmlinkage void deferred_error_interrupt(void);
 
 extern asmlinkage void call_function_interrupt(void);
 extern asmlinkage void call_function_single_interrupt(void);
@@ -87,60 +55,93 @@ extern void trace_spurious_interrupt(void);
 extern void trace_thermal_interrupt(void);
 extern void trace_reschedule_interrupt(void);
 extern void trace_threshold_interrupt(void);
+extern void trace_deferred_error_interrupt(void);
 extern void trace_call_function_interrupt(void);
 extern void trace_call_function_single_interrupt(void);
 #define trace_irq_move_cleanup_interrupt  irq_move_cleanup_interrupt
 #define trace_reboot_interrupt  reboot_interrupt
 #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
+#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi
 #endif /* CONFIG_TRACING */
 
-#ifdef CONFIG_IRQ_REMAP
-/* Intel specific interrupt remapping information */
-struct irq_2_iommu {
-	struct intel_iommu *iommu;
-	u16 irte_index;
-	u16 sub_handle;
-	u8  irte_mask;
-};
-
-/* AMD specific interrupt remapping information */
-struct irq_2_irte {
-	u16 devid; /* Device ID for IRTE table */
-	u16 index; /* Index into IRTE table*/
-};
-#endif	/* CONFIG_IRQ_REMAP */
-
 #ifdef	CONFIG_X86_LOCAL_APIC
 struct irq_data;
+struct pci_dev;
+struct msi_desc;
+
+enum irq_alloc_type {
+	X86_IRQ_ALLOC_TYPE_IOAPIC = 1,
+	X86_IRQ_ALLOC_TYPE_HPET,
+	X86_IRQ_ALLOC_TYPE_MSI,
+	X86_IRQ_ALLOC_TYPE_MSIX,
+	X86_IRQ_ALLOC_TYPE_DMAR,
+	X86_IRQ_ALLOC_TYPE_UV,
+};
 
-struct irq_cfg {
-	cpumask_var_t		domain;
-	cpumask_var_t		old_domain;
-	u8			vector;
-	u8			move_in_progress : 1;
-#ifdef CONFIG_IRQ_REMAP
-	u8			remapped : 1;
+struct irq_alloc_info {
+	enum irq_alloc_type	type;
+	u32			flags;
+	const struct cpumask	*mask;	/* CPU mask for vector allocation */
 	union {
-		struct irq_2_iommu irq_2_iommu;
-		struct irq_2_irte  irq_2_irte;
-	};
+		int		unused;
+#ifdef	CONFIG_HPET_TIMER
+		struct {
+			int		hpet_id;
+			int		hpet_index;
+			void		*hpet_data;
+		};
 #endif
-	union {
-#ifdef CONFIG_X86_IO_APIC
+#ifdef	CONFIG_PCI_MSI
 		struct {
-			struct list_head	irq_2_pin;
+			struct pci_dev	*msi_dev;
+			irq_hw_number_t	msi_hwirq;
+		};
+#endif
+#ifdef	CONFIG_X86_IO_APIC
+		struct {
+			int		ioapic_id;
+			int		ioapic_pin;
+			int		ioapic_node;
+			u32		ioapic_trigger : 1;
+			u32		ioapic_polarity : 1;
+			u32		ioapic_valid : 1;
+			struct IO_APIC_route_entry *ioapic_entry;
+		};
+#endif
+#ifdef	CONFIG_DMAR_TABLE
+		struct {
+			int		dmar_id;
+			void		*dmar_data;
+		};
+#endif
+#ifdef	CONFIG_HT_IRQ
+		struct {
+			int		ht_pos;
+			int		ht_idx;
+			struct pci_dev	*ht_dev;
+			void		*ht_update;
+		};
+#endif
+#ifdef	CONFIG_X86_UV
+		struct {
+			int		uv_limit;
+			int		uv_blade;
+			unsigned long	uv_offset;
+			char		*uv_name;
 		};
 #endif
 	};
 };
 
+struct irq_cfg {
+	unsigned int		dest_apicid;
+	u8			vector;
+};
+
 extern struct irq_cfg *irq_cfg(unsigned int irq);
 extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data);
-extern struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node);
 extern void lock_vector_lock(void);
 extern void unlock_vector_lock(void);
-extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
-extern void clear_irq_vector(int irq, struct irq_cfg *cfg);
 extern void setup_vector_irq(int cpu);
 #ifdef CONFIG_SMP
 extern void send_cleanup_vector(struct irq_cfg *);
@@ -150,10 +151,7 @@ static inline void send_cleanup_vector(struct irq_cfg *c) { }
 static inline void irq_complete_move(struct irq_cfg *c) { }
 #endif
 
-extern int apic_retrigger_irq(struct irq_data *data);
 extern void apic_ack_edge(struct irq_data *data);
-extern int apic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-			     unsigned int *dest_id);
 #else	/*  CONFIG_X86_LOCAL_APIC */
 static inline void lock_vector_lock(void) {}
 static inline void unlock_vector_lock(void) {}
@@ -163,8 +161,7 @@ static inline void unlock_vector_lock(void) {}
 extern atomic_t irq_err_count;
 extern atomic_t irq_mis_count;
 
-/* EISA */
-extern void eisa_set_level_irq(unsigned int irq);
+extern void elcr_set_level_irq(unsigned int irq);
 
 /* SMP */
 extern __visible void smp_apic_timer_interrupt(struct pt_regs *);
@@ -178,7 +175,6 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
 extern __visible void smp_reschedule_interrupt(struct pt_regs *);
 extern __visible void smp_call_function_interrupt(struct pt_regs *);
 extern __visible void smp_call_function_single_interrupt(struct pt_regs *);
-extern __visible void smp_invalidate_interrupt(struct pt_regs *);
 #endif
 
 extern char irq_entries_start[];
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
deleted file mode 100644
index 6eb6fcb83f63..000000000000
--- a/arch/x86/include/asm/i387.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (C) 1994 Linus Torvalds
- *
- * Pentium III FXSR, SSE support
- * General FPU state handling cleanups
- *	Gareth Hughes <gareth@valinux.com>, May 2000
- * x86-64 work by Andi Kleen 2002
- */
-
-#ifndef _ASM_X86_I387_H
-#define _ASM_X86_I387_H
-
-#ifndef __ASSEMBLY__
-
-#include <linux/sched.h>
-#include <linux/hardirq.h>
-
-struct pt_regs;
-struct user_i387_struct;
-
-extern int init_fpu(struct task_struct *child);
-extern void fpu_finit(struct fpu *fpu);
-extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
-extern void math_state_restore(void);
-
-extern bool irq_fpu_usable(void);
-
-/*
- * Careful: __kernel_fpu_begin/end() must be called with preempt disabled
- * and they don't touch the preempt state on their own.
- * If you enable preemption after __kernel_fpu_begin(), preempt notifier
- * should call the __kernel_fpu_end() to prevent the kernel/user FPU
- * state from getting corrupted. KVM for example uses this model.
- *
- * All other cases use kernel_fpu_begin/end() which disable preemption
- * during kernel FPU usage.
- */
-extern void __kernel_fpu_begin(void);
-extern void __kernel_fpu_end(void);
-
-static inline void kernel_fpu_begin(void)
-{
-	preempt_disable();
-	WARN_ON_ONCE(!irq_fpu_usable());
-	__kernel_fpu_begin();
-}
-
-static inline void kernel_fpu_end(void)
-{
-	__kernel_fpu_end();
-	preempt_enable();
-}
-
-/* Must be called with preempt disabled */
-extern void kernel_fpu_disable(void);
-extern void kernel_fpu_enable(void);
-
-/*
- * Some instructions like VIA's padlock instructions generate a spurious
- * DNA fault but don't modify SSE registers. And these instructions
- * get used from interrupt context as well. To prevent these kernel instructions
- * in interrupt context interacting wrongly with other user/kernel fpu usage, we
- * should use them only in the context of irq_ts_save/restore()
- */
-static inline int irq_ts_save(void)
-{
-	/*
-	 * If in process context and not atomic, we can take a spurious DNA fault.
-	 * Otherwise, doing clts() in process context requires disabling preemption
-	 * or some heavy lifting like kernel_fpu_begin()
-	 */
-	if (!in_atomic())
-		return 0;
-
-	if (read_cr0() & X86_CR0_TS) {
-		clts();
-		return 1;
-	}
-
-	return 0;
-}
-
-static inline void irq_ts_restore(int TS_state)
-{
-	if (TS_state)
-		stts();
-}
-
-/*
- * The question "does this thread have fpu access?"
- * is slightly racy, since preemption could come in
- * and revoke it immediately after the test.
- *
- * However, even in that very unlikely scenario,
- * we can just assume we have FPU access - typically
- * to save the FP state - we'll just take a #NM
- * fault and get the FPU access back.
- */
-static inline int user_has_fpu(void)
-{
-	return current->thread.fpu.has_fpu;
-}
-
-extern void unlazy_fpu(struct task_struct *tsk);
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 34a5b93704d3..83ec9b1d77cc 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -35,11 +35,13 @@
   */
 
 #define ARCH_HAS_IOREMAP_WC
+#define ARCH_HAS_IOREMAP_WT
 
 #include <linux/string.h>
 #include <linux/compiler.h>
 #include <asm/page.h>
 #include <asm/early_ioremap.h>
+#include <asm/pgtable_types.h>
 
 #define build_mmio_read(name, size, type, reg, barrier) \
 static inline type name(const volatile void __iomem *addr) \
@@ -177,6 +179,7 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
  * look at pci_iomap().
  */
 extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
 				unsigned long prot_val);
@@ -197,8 +200,6 @@ extern void set_iounmap_nonlazy(void);
 
 #include <asm-generic/iomap.h>
 
-#include <linux/vmalloc.h>
-
 /*
  * Convert a virtual cached pointer to an uncached pointer
  */
@@ -320,6 +321,7 @@ extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
 extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
 				enum page_cache_mode pcm);
 extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
 
 extern bool is_early_ioremap_ptep(pte_t *ptep);
 
@@ -338,6 +340,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
 #define IO_SPACE_LIMIT 0xffff
 
 #ifdef CONFIG_MTRR
+extern int __must_check arch_phys_wc_index(int handle);
+#define arch_phys_wc_index arch_phys_wc_index
+
 extern int __must_check arch_phys_wc_add(unsigned long base,
 					 unsigned long size);
 extern void arch_phys_wc_del(int handle);
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 2f91685fe1cd..6cbf2cfb3f8a 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -95,9 +95,22 @@ struct IR_IO_APIC_route_entry {
 		index		: 15;
 } __attribute__ ((packed));
 
-#define IOAPIC_AUTO     -1
-#define IOAPIC_EDGE     0
-#define IOAPIC_LEVEL    1
+struct irq_alloc_info;
+struct ioapic_domain_cfg;
+
+#define IOAPIC_AUTO			-1
+#define IOAPIC_EDGE			0
+#define IOAPIC_LEVEL			1
+
+#define IOAPIC_MASKED			1
+#define IOAPIC_UNMASKED			0
+
+#define IOAPIC_POL_HIGH			0
+#define IOAPIC_POL_LOW			1
+
+#define IOAPIC_DEST_MODE_PHYSICAL	0
+#define IOAPIC_DEST_MODE_LOGICAL	1
+
 #define	IOAPIC_MAP_ALLOC		0x1
 #define	IOAPIC_MAP_CHECK		0x2
 
@@ -110,9 +123,6 @@ extern int nr_ioapics;
 
 extern int mpc_ioapic_id(int ioapic);
 extern unsigned int mpc_ioapic_addr(int ioapic);
-extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic);
-
-#define MP_MAX_IOAPIC_PIN 127
 
 /* # of MP IRQ source entries */
 extern int mp_irq_entries;
@@ -120,9 +130,6 @@ extern int mp_irq_entries;
 /* MP IRQ source entries */
 extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
-/* Older SiS APIC requires we rewrite the index register */
-extern int sis_apic_bug;
-
 /* 1 if "noapic" boot option passed */
 extern int skip_ioapic_setup;
 
@@ -132,6 +139,8 @@ extern int noioapicquirk;
 /* -1 if "noapic" boot option passed */
 extern int noioapicreroute;
 
+extern u32 gsi_top;
+
 extern unsigned long io_apic_irqs;
 
 #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1 << (x)) & io_apic_irqs))
@@ -147,13 +156,6 @@ struct irq_cfg;
 extern void ioapic_insert_resources(void);
 extern int arch_early_ioapic_init(void);
 
-extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *,
-				     unsigned int, int,
-				     struct io_apic_irq_attr *);
-extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg);
-
-extern void native_eoi_ioapic_pin(int apic, int pin, int vector);
-
 extern int save_ioapic_entries(void);
 extern void mask_ioapic_entries(void);
 extern int restore_ioapic_entries(void);
@@ -161,82 +163,32 @@ extern int restore_ioapic_entries(void);
 extern void setup_ioapic_ids_from_mpc(void);
 extern void setup_ioapic_ids_from_mpc_nocheck(void);
 
-struct io_apic_irq_attr {
-	int ioapic;
-	int ioapic_pin;
-	int trigger;
-	int polarity;
-};
-
-enum ioapic_domain_type {
-	IOAPIC_DOMAIN_INVALID,
-	IOAPIC_DOMAIN_LEGACY,
-	IOAPIC_DOMAIN_STRICT,
-	IOAPIC_DOMAIN_DYNAMIC,
-};
-
-struct device_node;
-struct irq_domain;
-struct irq_domain_ops;
-
-struct ioapic_domain_cfg {
-	enum ioapic_domain_type		type;
-	const struct irq_domain_ops	*ops;
-	struct device_node		*dev;
-};
-
-struct mp_ioapic_gsi{
-	u32 gsi_base;
-	u32 gsi_end;
-};
-extern u32 gsi_top;
-
 extern int mp_find_ioapic(u32 gsi);
 extern int mp_find_ioapic_pin(int ioapic, u32 gsi);
-extern u32 mp_pin_to_gsi(int ioapic, int pin);
-extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags);
+extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags,
+			     struct irq_alloc_info *info);
 extern void mp_unmap_irq(int irq);
 extern int mp_register_ioapic(int id, u32 address, u32 gsi_base,
 			      struct ioapic_domain_cfg *cfg);
 extern int mp_unregister_ioapic(u32 gsi_base);
 extern int mp_ioapic_registered(u32 gsi_base);
-extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
-			    irq_hw_number_t hwirq);
-extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq);
-extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node);
-extern void __init pre_init_apic_IRQ0(void);
+
+extern void ioapic_set_alloc_attr(struct irq_alloc_info *info,
+				  int node, int trigger, int polarity);
 
 extern void mp_save_irq(struct mpc_intsrc *m);
 
 extern void disable_ioapic_support(void);
 
-extern void __init native_io_apic_init_mappings(void);
+extern void __init io_apic_init_mappings(void);
 extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg);
-extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val);
-extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val);
 extern void native_disable_io_apic(void);
-extern void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries);
-extern void intel_ir_io_apic_print_entries(unsigned int apic, unsigned int nr_entries);
-extern int native_ioapic_set_affinity(struct irq_data *,
-				      const struct cpumask *,
-				      bool);
 
 static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
 {
 	return x86_io_apic_ops.read(apic, reg);
 }
 
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	x86_io_apic_ops.write(apic, reg, value);
-}
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	x86_io_apic_ops.modify(apic, reg, value);
-}
-
-extern void io_apic_eoi(unsigned int apic, unsigned int vector);
-
 extern void setup_IO_APIC(void);
 extern void enable_IO_APIC(void);
 extern void disable_IO_APIC(void);
@@ -253,8 +205,12 @@ static inline int arch_early_ioapic_init(void) { return 0; }
 static inline void print_IO_APICs(void) {}
 #define gsi_top (NR_IRQS_LEGACY)
 static inline int mp_find_ioapic(u32 gsi) { return 0; }
-static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; }
-static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; }
+static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags,
+				    struct irq_alloc_info *info)
+{
+	return gsi;
+}
+
 static inline void mp_unmap_irq(int irq) { }
 
 static inline int save_ioapic_entries(void)
@@ -268,17 +224,11 @@ static inline int restore_ioapic_entries(void)
 	return -ENOMEM;
 }
 
-static inline void mp_save_irq(struct mpc_intsrc *m) { };
+static inline void mp_save_irq(struct mpc_intsrc *m) { }
 static inline void disable_ioapic_support(void) { }
-#define native_io_apic_init_mappings	NULL
+static inline void io_apic_init_mappings(void) { }
 #define native_io_apic_read		NULL
-#define native_io_apic_write		NULL
-#define native_io_apic_modify		NULL
 #define native_disable_io_apic		NULL
-#define native_io_apic_print_entries	NULL
-#define native_ioapic_set_affinity	NULL
-#define native_setup_ioapic_entry	NULL
-#define native_eoi_ioapic_pin		NULL
 
 static inline void setup_IO_APIC(void) { }
 static inline void enable_IO_APIC(void) { }
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index a80cbb88ea91..8008d06581c7 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -30,6 +30,10 @@ extern void fixup_irqs(void);
 extern void irq_force_complete_move(int);
 #endif
 
+#ifdef CONFIG_HAVE_KVM
+extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));
+#endif
+
 extern void (*x86_platform_ipi_callback)(void);
 extern void native_init_IRQ(void);
 extern bool handle_irq(unsigned irq, struct pt_regs *regs);
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 6224d316c405..046c7fb1ca43 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -22,84 +22,72 @@
 #ifndef __X86_IRQ_REMAPPING_H
 #define __X86_IRQ_REMAPPING_H
 
+#include <asm/irqdomain.h>
+#include <asm/hw_irq.h>
 #include <asm/io_apic.h>
 
-struct IO_APIC_route_entry;
-struct io_apic_irq_attr;
-struct irq_chip;
 struct msi_msg;
-struct pci_dev;
-struct irq_cfg;
+struct irq_alloc_info;
+
+enum irq_remap_cap {
+	IRQ_POSTING_CAP = 0,
+};
 
 #ifdef CONFIG_IRQ_REMAP
 
+extern bool irq_remapping_cap(enum irq_remap_cap cap);
 extern void set_irq_remapping_broken(void);
 extern int irq_remapping_prepare(void);
 extern int irq_remapping_enable(void);
 extern void irq_remapping_disable(void);
 extern int irq_remapping_reenable(int);
 extern int irq_remap_enable_fault_handling(void);
-extern int setup_ioapic_remapped_entry(int irq,
-				       struct IO_APIC_route_entry *entry,
-				       unsigned int destination,
-				       int vector,
-				       struct io_apic_irq_attr *attr);
-extern void free_remapped_irq(int irq);
-extern void compose_remapped_msi_msg(struct pci_dev *pdev,
-				     unsigned int irq, unsigned int dest,
-				     struct msi_msg *msg, u8 hpet_id);
-extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id);
 extern void panic_if_irq_remap(const char *msg);
-extern bool setup_remapped_irq(int irq,
-			       struct irq_cfg *cfg,
-			       struct irq_chip *chip);
 
-void irq_remap_modify_chip_defaults(struct irq_chip *chip);
+extern struct irq_domain *
+irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info);
+extern struct irq_domain *
+irq_remapping_get_irq_domain(struct irq_alloc_info *info);
+
+/* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */
+extern struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent);
+
+/* Get parent irqdomain for interrupt remapping irqdomain */
+static inline struct irq_domain *arch_get_ir_parent_domain(void)
+{
+	return x86_vector_domain;
+}
+
+struct vcpu_data {
+	u64 pi_desc_addr;	/* Physical address of PI Descriptor */
+	u32 vector;		/* Guest vector of the interrupt */
+};
 
 #else  /* CONFIG_IRQ_REMAP */
 
+static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; }
 static inline void set_irq_remapping_broken(void) { }
 static inline int irq_remapping_prepare(void) { return -ENODEV; }
 static inline int irq_remapping_enable(void) { return -ENODEV; }
 static inline void irq_remapping_disable(void) { }
 static inline int irq_remapping_reenable(int eim) { return -ENODEV; }
 static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; }
-static inline int setup_ioapic_remapped_entry(int irq,
-					      struct IO_APIC_route_entry *entry,
-					      unsigned int destination,
-					      int vector,
-					      struct io_apic_irq_attr *attr)
-{
-	return -ENODEV;
-}
-static inline void free_remapped_irq(int irq) { }
-static inline void compose_remapped_msi_msg(struct pci_dev *pdev,
-					    unsigned int irq, unsigned int dest,
-					    struct msi_msg *msg, u8 hpet_id)
-{
-}
-static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id)
-{
-	return -ENODEV;
-}
 
 static inline void panic_if_irq_remap(const char *msg)
 {
 }
 
-static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip)
+static inline struct irq_domain *
+irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info)
 {
+	return NULL;
 }
 
-static inline bool setup_remapped_irq(int irq,
-				      struct irq_cfg *cfg,
-				      struct irq_chip *chip)
+static inline struct irq_domain *
+irq_remapping_get_irq_domain(struct irq_alloc_info *info)
 {
-	return false;
+	return NULL;
 }
-#endif /* CONFIG_IRQ_REMAP */
-
-#define dmar_alloc_hwirq()	irq_alloc_hwirq(-1)
-#define dmar_free_hwirq		irq_free_hwirq
 
+#endif /* CONFIG_IRQ_REMAP */
 #endif /* __X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 666c89ec4bd7..4c2d2eb2060a 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -47,31 +47,12 @@
 #define IRQ_MOVE_CLEANUP_VECTOR		FIRST_EXTERNAL_VECTOR
 
 #define IA32_SYSCALL_VECTOR		0x80
-#ifdef CONFIG_X86_32
-# define SYSCALL_VECTOR			0x80
-#endif
 
 /*
  * Vectors 0x30-0x3f are used for ISA interrupts.
  *   round up to the next 16-vector boundary
  */
-#define IRQ0_VECTOR			((FIRST_EXTERNAL_VECTOR + 16) & ~15)
-
-#define IRQ1_VECTOR			(IRQ0_VECTOR +  1)
-#define IRQ2_VECTOR			(IRQ0_VECTOR +  2)
-#define IRQ3_VECTOR			(IRQ0_VECTOR +  3)
-#define IRQ4_VECTOR			(IRQ0_VECTOR +  4)
-#define IRQ5_VECTOR			(IRQ0_VECTOR +  5)
-#define IRQ6_VECTOR			(IRQ0_VECTOR +  6)
-#define IRQ7_VECTOR			(IRQ0_VECTOR +  7)
-#define IRQ8_VECTOR			(IRQ0_VECTOR +  8)
-#define IRQ9_VECTOR			(IRQ0_VECTOR +  9)
-#define IRQ10_VECTOR			(IRQ0_VECTOR + 10)
-#define IRQ11_VECTOR			(IRQ0_VECTOR + 11)
-#define IRQ12_VECTOR			(IRQ0_VECTOR + 12)
-#define IRQ13_VECTOR			(IRQ0_VECTOR + 13)
-#define IRQ14_VECTOR			(IRQ0_VECTOR + 14)
-#define IRQ15_VECTOR			(IRQ0_VECTOR + 15)
+#define ISA_IRQ_VECTOR(irq)		(((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq)
 
 /*
  * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
@@ -102,21 +83,23 @@
  */
 #define X86_PLATFORM_IPI_VECTOR		0xf7
 
-/* Vector for KVM to deliver posted interrupt IPI */
-#ifdef CONFIG_HAVE_KVM
-#define POSTED_INTR_VECTOR		0xf2
-#endif
-
+#define POSTED_INTR_WAKEUP_VECTOR	0xf1
 /*
  * IRQ work vector:
  */
 #define IRQ_WORK_VECTOR			0xf6
 
 #define UV_BAU_MESSAGE			0xf5
+#define DEFERRED_ERROR_VECTOR		0xf4
 
 /* Vector on which hypervisor callbacks will be delivered */
 #define HYPERVISOR_CALLBACK_VECTOR	0xf3
 
+/* Vector for KVM to deliver posted interrupt IPI */
+#ifdef CONFIG_HAVE_KVM
+#define POSTED_INTR_VECTOR		0xf2
+#endif
+
 /*
  * Local APIC timer IRQ vector is on a different priority level,
  * to work around the 'lost local interrupt if more than 2 IRQ
@@ -155,18 +138,22 @@ static inline int invalid_vm86_irq(int irq)
  * static arrays.
  */
 
-#define NR_IRQS_LEGACY			  16
+#define NR_IRQS_LEGACY			16
 
-#define IO_APIC_VECTOR_LIMIT		( 32 * MAX_IO_APICS )
+#define CPU_VECTOR_LIMIT		(64 * NR_CPUS)
+#define IO_APIC_VECTOR_LIMIT		(32 * MAX_IO_APICS)
 
-#ifdef CONFIG_X86_IO_APIC
-# define CPU_VECTOR_LIMIT		(64 * NR_CPUS)
-# define NR_IRQS					\
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI)
+#define NR_IRQS						\
 	(CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ?	\
 		(NR_VECTORS + CPU_VECTOR_LIMIT)  :	\
 		(NR_VECTORS + IO_APIC_VECTOR_LIMIT))
-#else /* !CONFIG_X86_IO_APIC: */
-# define NR_IRQS			NR_IRQS_LEGACY
+#elif defined(CONFIG_X86_IO_APIC)
+#define	NR_IRQS				(NR_VECTORS + IO_APIC_VECTOR_LIMIT)
+#elif defined(CONFIG_PCI_MSI)
+#define NR_IRQS				(NR_VECTORS + CPU_VECTOR_LIMIT)
+#else
+#define NR_IRQS				NR_IRQS_LEGACY
 #endif
 
 #endif /* _ASM_X86_IRQ_VECTORS_H */
diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h
new file mode 100644
index 000000000000..d26075b52885
--- /dev/null
+++ b/arch/x86/include/asm/irqdomain.h
@@ -0,0 +1,63 @@
+#ifndef _ASM_IRQDOMAIN_H
+#define _ASM_IRQDOMAIN_H
+
+#include <linux/irqdomain.h>
+#include <asm/hw_irq.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+enum {
+	/* Allocate contiguous CPU vectors */
+	X86_IRQ_ALLOC_CONTIGUOUS_VECTORS		= 0x1,
+};
+
+extern struct irq_domain *x86_vector_domain;
+
+extern void init_irq_alloc_info(struct irq_alloc_info *info,
+				const struct cpumask *mask);
+extern void copy_irq_alloc_info(struct irq_alloc_info *dst,
+				struct irq_alloc_info *src);
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+struct device_node;
+struct irq_data;
+
+enum ioapic_domain_type {
+	IOAPIC_DOMAIN_INVALID,
+	IOAPIC_DOMAIN_LEGACY,
+	IOAPIC_DOMAIN_STRICT,
+	IOAPIC_DOMAIN_DYNAMIC,
+};
+
+struct ioapic_domain_cfg {
+	enum ioapic_domain_type		type;
+	const struct irq_domain_ops	*ops;
+	struct device_node		*dev;
+};
+
+extern const struct irq_domain_ops mp_ioapic_irqdomain_ops;
+
+extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs, void *arg);
+extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs);
+extern void mp_irqdomain_activate(struct irq_domain *domain,
+				  struct irq_data *irq_data);
+extern void mp_irqdomain_deactivate(struct irq_domain *domain,
+				    struct irq_data *irq_data);
+extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain);
+#endif /* CONFIG_X86_IO_APIC */
+
+#ifdef CONFIG_PCI_MSI
+extern void arch_init_msi_domain(struct irq_domain *domain);
+#else
+static inline void arch_init_msi_domain(struct irq_domain *domain) { }
+#endif
+
+#ifdef CONFIG_HT_IRQ
+extern void arch_init_htirq_domain(struct irq_domain *domain);
+#else
+static inline void arch_init_htirq_domain(struct irq_domain *domain) { }
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 57a9d94fe160..e16466ec473c 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -193,6 +193,8 @@ struct x86_emulate_ops {
 	int (*cpl)(struct x86_emulate_ctxt *ctxt);
 	int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
 	int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
+	u64 (*get_smbase)(struct x86_emulate_ctxt *ctxt);
+	void (*set_smbase)(struct x86_emulate_ctxt *ctxt, u64 smbase);
 	int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
 	int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
 	int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc);
@@ -262,6 +264,11 @@ enum x86emul_mode {
 	X86EMUL_MODE_PROT64,	/* 64-bit (long) mode.    */
 };
 
+/* These match some of the HF_* flags defined in kvm_host.h  */
+#define X86EMUL_GUEST_MASK           (1 << 5) /* VCPU is in guest-mode */
+#define X86EMUL_SMM_MASK             (1 << 6)
+#define X86EMUL_SMM_INSIDE_NMI_MASK  (1 << 7)
+
 struct x86_emulate_ctxt {
 	const struct x86_emulate_ops *ops;
 
@@ -273,8 +280,8 @@ struct x86_emulate_ctxt {
 
 	/* interruptibility state, as a result of execution of STI or MOV SS */
 	int interruptibility;
+	int emul_flags;
 
-	bool guest_mode; /* guest running a nested guest */
 	bool perm_ok; /* do not check permissions if true */
 	bool ud;	/* inject an #UD if host doesn't support insn */
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dea2e7e962e3..c7fa57b529d2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -184,29 +184,28 @@ struct kvm_mmu_memory_cache {
 	void *objects[KVM_NR_MEM_OBJS];
 };
 
-/*
- * kvm_mmu_page_role, below, is defined as:
- *
- *   bits 0:3 - total guest paging levels (2-4, or zero for real mode)
- *   bits 4:7 - page table level for this shadow (1-4)
- *   bits 8:9 - page table quadrant for 2-level guests
- *   bit   16 - direct mapping of virtual to physical mapping at gfn
- *              used for real mode and two-dimensional paging
- *   bits 17:19 - common access permissions for all ptes in this shadow page
- */
 union kvm_mmu_page_role {
 	unsigned word;
 	struct {
 		unsigned level:4;
 		unsigned cr4_pae:1;
 		unsigned quadrant:2;
-		unsigned pad_for_nice_hex_output:6;
 		unsigned direct:1;
 		unsigned access:3;
 		unsigned invalid:1;
 		unsigned nxe:1;
 		unsigned cr0_wp:1;
 		unsigned smep_andnot_wp:1;
+		unsigned smap_andnot_wp:1;
+		unsigned :8;
+
+		/*
+		 * This is left at the top of the word so that
+		 * kvm_memslots_for_spte_role can extract it with a
+		 * simple shift.  While there is room, give it a whole
+		 * byte so it is also faster to load it from memory.
+		 */
+		unsigned smm:8;
 	};
 };
 
@@ -337,12 +336,28 @@ struct kvm_pmu {
 	u64 reprogram_pmi;
 };
 
+struct kvm_pmu_ops;
+
 enum {
 	KVM_DEBUGREG_BP_ENABLED = 1,
 	KVM_DEBUGREG_WONT_EXIT = 2,
 	KVM_DEBUGREG_RELOAD = 4,
 };
 
+struct kvm_mtrr_range {
+	u64 base;
+	u64 mask;
+	struct list_head node;
+};
+
+struct kvm_mtrr {
+	struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR];
+	mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION];
+	u64 deftype;
+
+	struct list_head head;
+};
+
 struct kvm_vcpu_arch {
 	/*
 	 * rip and regs accesses must go through
@@ -367,6 +382,7 @@ struct kvm_vcpu_arch {
 	int32_t apic_arb_prio;
 	int mp_state;
 	u64 ia32_misc_enable_msr;
+	u64 smbase;
 	bool tpr_access_reporting;
 	u64 ia32_xss;
 
@@ -400,6 +416,7 @@ struct kvm_vcpu_arch {
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
 	struct fpu guest_fpu;
+	bool eager_fpu;
 	u64 xcr0;
 	u64 guest_supported_xcr0;
 	u32 guest_xstate_size;
@@ -469,8 +486,9 @@ struct kvm_vcpu_arch {
 	atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
 	unsigned nmi_pending; /* NMI queued after currently running handler */
 	bool nmi_injected;    /* Trying to inject an NMI this entry */
+	bool smi_pending;    /* SMI queued after currently running handler */
 
-	struct mtrr_state_type mtrr_state;
+	struct kvm_mtrr mtrr_state;
 	u64 pat;
 
 	unsigned switch_db_regs;
@@ -635,6 +653,8 @@ struct kvm_arch {
 	#endif
 
 	bool boot_vcpu_runs_old_kvmclock;
+
+	u64 disabled_quirks;
 };
 
 struct kvm_vm_stat {
@@ -687,12 +707,13 @@ struct msr_data {
 
 struct kvm_lapic_irq {
 	u32 vector;
-	u32 delivery_mode;
-	u32 dest_mode;
-	u32 level;
-	u32 trig_mode;
+	u16 delivery_mode;
+	u16 dest_mode;
+	bool level;
+	u16 trig_mode;
 	u32 shorthand;
 	u32 dest_id;
+	bool msi_redir_hint;
 };
 
 struct kvm_x86_ops {
@@ -704,19 +725,20 @@ struct kvm_x86_ops {
 	int (*hardware_setup)(void);               /* __init */
 	void (*hardware_unsetup)(void);            /* __exit */
 	bool (*cpu_has_accelerated_tpr)(void);
+	bool (*cpu_has_high_real_mode_segbase)(void);
 	void (*cpuid_update)(struct kvm_vcpu *vcpu);
 
 	/* Create, but do not attach this VCPU */
 	struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
 	void (*vcpu_free)(struct kvm_vcpu *vcpu);
-	void (*vcpu_reset)(struct kvm_vcpu *vcpu);
+	void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
 
 	void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
 	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
 	void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
 	void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
-	int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+	int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
 	int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
 	u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
 	void (*get_segment)(struct kvm_vcpu *vcpu,
@@ -743,6 +765,7 @@ struct kvm_x86_ops {
 	void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
 	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+	void (*fpu_activate)(struct kvm_vcpu *vcpu);
 	void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
 
 	void (*tlb_flush)(struct kvm_vcpu *vcpu);
@@ -833,6 +856,8 @@ struct kvm_x86_ops {
 	void (*enable_log_dirty_pt_masked)(struct kvm *kvm,
 					   struct kvm_memory_slot *slot,
 					   gfn_t offset, unsigned long mask);
+	/* pmu operations of sub-arch */
+	const struct kvm_pmu_ops *pmu_ops;
 };
 
 struct kvm_arch_async_pf {
@@ -868,7 +893,7 @@ void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 				      struct kvm_memory_slot *memslot);
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
-					struct kvm_memory_slot *memslot);
+				   const struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
@@ -879,7 +904,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 				   struct kvm_memory_slot *slot,
 				   gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
-void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm);
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
@@ -887,7 +912,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
 
 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 			  const void *val, int bytes);
-u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 struct kvm_irq_mask_notifier {
 	void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
@@ -935,7 +959,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
 
 void kvm_enable_efer_bits(u64);
 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer);
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
+int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
 int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 struct x86_emulate_ctxt;
@@ -964,7 +988,7 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
 
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
 
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
@@ -999,8 +1023,6 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
-int fx_init(struct kvm_vcpu *vcpu);
-
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes);
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
@@ -1109,6 +1131,14 @@ enum {
 #define HF_NMI_MASK		(1 << 3)
 #define HF_IRET_MASK		(1 << 4)
 #define HF_GUEST_MASK		(1 << 5) /* VCPU is in guest-mode */
+#define HF_SMM_MASK		(1 << 6)
+#define HF_SMM_INSIDE_NMI_MASK	(1 << 7)
+
+#define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE
+#define KVM_ADDRESS_SPACE_NUM 2
+
+#define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0)
+#define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm)
 
 /*
  * Hardware virtualization extension instructions may fault if a
@@ -1143,7 +1173,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu);
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
 void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
 					   unsigned long address);
@@ -1167,16 +1197,9 @@ void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
 
 int kvm_is_in_guest(void);
 
-void kvm_pmu_init(struct kvm_vcpu *vcpu);
-void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
-void kvm_pmu_reset(struct kvm_vcpu *vcpu);
-void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu);
-bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr);
-int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
-int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
-int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc);
-int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
-void kvm_handle_pmu_event(struct kvm_vcpu *vcpu);
-void kvm_deliver_pmi(struct kvm_vcpu *vcpu);
+int __x86_set_memory_region(struct kvm *kvm,
+			    const struct kvm_userspace_memory_region *mem);
+int x86_set_memory_region(struct kvm *kvm,
+			  const struct kvm_userspace_memory_region *mem);
 
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index 2d29197bd2fb..19c099afa861 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -21,6 +21,7 @@
 #ifndef _ASM_X86_LIVEPATCH_H
 #define _ASM_X86_LIVEPATCH_H
 
+#include <asm/setup.h>
 #include <linux/module.h>
 #include <linux/ftrace.h>
 
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 1f5a86d518db..982dfc3679ad 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -17,11 +17,16 @@
 #define MCG_EXT_CNT(c)		(((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
 #define MCG_SER_P		(1ULL<<24)   /* MCA recovery/new status bits */
 #define MCG_ELOG_P		(1ULL<<26)   /* Extended error log supported */
+#define MCG_LMCE_P		(1ULL<<27)   /* Local machine check supported */
 
 /* MCG_STATUS register defines */
 #define MCG_STATUS_RIPV  (1ULL<<0)   /* restart ip valid */
 #define MCG_STATUS_EIPV  (1ULL<<1)   /* ip points to correct instruction */
 #define MCG_STATUS_MCIP  (1ULL<<2)   /* machine check in progress */
+#define MCG_STATUS_LMCES (1ULL<<3)   /* LMCE signaled */
+
+/* MCG_EXT_CTL register defines */
+#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */
 
 /* MCi_STATUS register defines */
 #define MCI_STATUS_VAL   (1ULL<<63)  /* valid error */
@@ -104,6 +109,7 @@ struct mce_log {
 struct mca_config {
 	bool dont_log_ce;
 	bool cmci_disabled;
+	bool lmce_disabled;
 	bool ignore_ce;
 	bool disabled;
 	bool ser;
@@ -117,8 +123,19 @@ struct mca_config {
 };
 
 struct mce_vendor_flags {
-	__u64		overflow_recov	: 1, /* cpuid_ebx(80000007) */
-			__reserved_0	: 63;
+			/*
+			 * overflow recovery cpuid bit indicates that overflow
+			 * conditions are not fatal
+			 */
+	__u64		overflow_recov	: 1,
+
+			/*
+			 * SUCCOR stands for S/W UnCorrectable error COntainment
+			 * and Recovery. It indicates support for data poisoning
+			 * in HW and deferred error interrupts.
+			 */
+			succor		: 1,
+			__reserved_0	: 62;
 };
 extern struct mce_vendor_flags mce_flags;
 
@@ -168,12 +185,16 @@ void cmci_clear(void);
 void cmci_reenable(void);
 void cmci_rediscover(void);
 void cmci_recheck(void);
+void lmce_clear(void);
+void lmce_enable(void);
 #else
 static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
 static inline void cmci_clear(void) {}
 static inline void cmci_reenable(void) {}
 static inline void cmci_rediscover(void) {}
 static inline void cmci_recheck(void) {}
+static inline void lmce_clear(void) {}
+static inline void lmce_enable(void) {}
 #endif
 
 #ifdef CONFIG_X86_MCE_AMD
@@ -223,6 +244,9 @@ void do_machine_check(struct pt_regs *, long);
 extern void (*mce_threshold_vector)(void);
 extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
+/* Deferred error interrupt handler */
+extern void (*deferred_error_int_vector)(void);
+
 /*
  * Thermal handler
  */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 2fb20d6f7e23..9e6278c7140e 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_MICROCODE_H
 #define _ASM_X86_MICROCODE_H
 
+#include <linux/earlycpio.h>
+
 #define native_rdmsr(msr, val1, val2)			\
 do {							\
 	u64 __val = native_read_msr((msr));		\
@@ -152,6 +154,7 @@ extern void __init load_ucode_bsp(void);
 extern void load_ucode_ap(void);
 extern int __init save_microcode_in_initrd(void);
 void reload_early_microcode(void);
+extern bool get_builtin_firmware(struct cpio_data *cd, const char *name);
 #else
 static inline void __init load_ucode_bsp(void) {}
 static inline void load_ucode_ap(void) {}
@@ -160,6 +163,9 @@ static inline int __init save_microcode_in_initrd(void)
 	return 0;
 }
 static inline void reload_early_microcode(void) {}
+static inline bool get_builtin_firmware(struct cpio_data *cd, const char *name)
+{
+	return false;
+}
 #endif
-
 #endif /* _ASM_X86_MICROCODE_H */
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
index af935397e053..ac6d328977a6 100644
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -65,12 +65,12 @@ extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, s
 extern u8 amd_ucode_patch[PATCH_MAX_SIZE];
 
 #ifdef CONFIG_MICROCODE_AMD_EARLY
-extern void __init load_ucode_amd_bsp(void);
+extern void __init load_ucode_amd_bsp(unsigned int family);
 extern void load_ucode_amd_ap(void);
 extern int __init save_microcode_in_initrd_amd(void);
 void reload_ucode_amd(void);
 #else
-static inline void __init load_ucode_amd_bsp(void) {}
+static inline void __init load_ucode_amd_bsp(unsigned int family) {}
 static inline void load_ucode_amd_ap(void) {}
 static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; }
 void reload_ucode_amd(void) {}
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index 2b9209c46ca9..7991c606125d 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -51,20 +51,11 @@ struct extended_sigtable {
 	(((struct microcode_intel *)mc)->hdr.datasize ? \
 	 ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
 
-#define sigmatch(s1, s2, p1, p2) \
-	(((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
-
 #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
 
-extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc);
+extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev);
 extern int microcode_sanity_check(void *mc, int print_err);
-extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc);
-
-static inline int
-revision_is_newer(struct microcode_header_intel *mc_header, int rev)
-{
-	return (mc_header->rev <= rev) ? 0 : 1;
-}
+extern int find_matching_signature(void *mc, unsigned int csig, int cpf);
 
 #ifdef CONFIG_MICROCODE_INTEL_EARLY
 extern void __init load_ucode_intel_bsp(void);
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 883f6b933fa4..5e8daee7c5c9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -142,6 +142,19 @@ static inline void arch_exit_mmap(struct mm_struct *mm)
 	paravirt_arch_exit_mmap(mm);
 }
 
+#ifdef CONFIG_X86_64
+static inline bool is_64bit_mm(struct mm_struct *mm)
+{
+	return	!config_enabled(CONFIG_IA32_EMULATION) ||
+		!(mm->context.ia32_compat == TIF_IA32);
+}
+#else
+static inline bool is_64bit_mm(struct mm_struct *mm)
+{
+	return false;
+}
+#endif
+
 static inline void arch_bprm_mm_init(struct mm_struct *mm,
 		struct vm_area_struct *vma)
 {
diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h
index a952a13d59a7..7a35495275a9 100644
--- a/arch/x86/include/asm/mpx.h
+++ b/arch/x86/include/asm/mpx.h
@@ -13,55 +13,50 @@
 #define MPX_BNDCFG_ENABLE_FLAG	0x1
 #define MPX_BD_ENTRY_VALID_FLAG	0x1
 
-#ifdef CONFIG_X86_64
-
-/* upper 28 bits [47:20] of the virtual address in 64-bit used to
- * index into bounds directory (BD).
- */
-#define MPX_BD_ENTRY_OFFSET	28
-#define MPX_BD_ENTRY_SHIFT	3
-/* bits [19:3] of the virtual address in 64-bit used to index into
- * bounds table (BT).
+/*
+ * The upper 28 bits [47:20] of the virtual address in 64-bit
+ * are used to index into bounds directory (BD).
+ *
+ * The directory is 2G (2^31) in size, and with 8-byte entries
+ * it has 2^28 entries.
  */
-#define MPX_BT_ENTRY_OFFSET	17
-#define MPX_BT_ENTRY_SHIFT	5
-#define MPX_IGN_BITS		3
-#define MPX_BD_ENTRY_TAIL	3
+#define MPX_BD_SIZE_BYTES_64	(1UL<<31)
+#define MPX_BD_ENTRY_BYTES_64	8
+#define MPX_BD_NR_ENTRIES_64	(MPX_BD_SIZE_BYTES_64/MPX_BD_ENTRY_BYTES_64)
 
-#else
-
-#define MPX_BD_ENTRY_OFFSET	20
-#define MPX_BD_ENTRY_SHIFT	2
-#define MPX_BT_ENTRY_OFFSET	10
-#define MPX_BT_ENTRY_SHIFT	4
-#define MPX_IGN_BITS		2
-#define MPX_BD_ENTRY_TAIL	2
+/*
+ * The 32-bit directory is 4MB (2^22) in size, and with 4-byte
+ * entries it has 2^20 entries.
+ */
+#define MPX_BD_SIZE_BYTES_32	(1UL<<22)
+#define MPX_BD_ENTRY_BYTES_32	4
+#define MPX_BD_NR_ENTRIES_32	(MPX_BD_SIZE_BYTES_32/MPX_BD_ENTRY_BYTES_32)
 
-#endif
+/*
+ * A 64-bit table is 4MB total in size, and an entry is
+ * 4 64-bit pointers in size.
+ */
+#define MPX_BT_SIZE_BYTES_64	(1UL<<22)
+#define MPX_BT_ENTRY_BYTES_64	32
+#define MPX_BT_NR_ENTRIES_64	(MPX_BT_SIZE_BYTES_64/MPX_BT_ENTRY_BYTES_64)
 
-#define MPX_BD_SIZE_BYTES (1UL<<(MPX_BD_ENTRY_OFFSET+MPX_BD_ENTRY_SHIFT))
-#define MPX_BT_SIZE_BYTES (1UL<<(MPX_BT_ENTRY_OFFSET+MPX_BT_ENTRY_SHIFT))
+/*
+ * A 32-bit table is 16kB total in size, and an entry is
+ * 4 32-bit pointers in size.
+ */
+#define MPX_BT_SIZE_BYTES_32	(1UL<<14)
+#define MPX_BT_ENTRY_BYTES_32	16
+#define MPX_BT_NR_ENTRIES_32	(MPX_BT_SIZE_BYTES_32/MPX_BT_ENTRY_BYTES_32)
 
 #define MPX_BNDSTA_TAIL		2
 #define MPX_BNDCFG_TAIL		12
 #define MPX_BNDSTA_ADDR_MASK	(~((1UL<<MPX_BNDSTA_TAIL)-1))
 #define MPX_BNDCFG_ADDR_MASK	(~((1UL<<MPX_BNDCFG_TAIL)-1))
-#define MPX_BT_ADDR_MASK	(~((1UL<<MPX_BD_ENTRY_TAIL)-1))
-
-#define MPX_BNDCFG_ADDR_MASK	(~((1UL<<MPX_BNDCFG_TAIL)-1))
 #define MPX_BNDSTA_ERROR_CODE	0x3
 
-#define MPX_BD_ENTRY_MASK	((1<<MPX_BD_ENTRY_OFFSET)-1)
-#define MPX_BT_ENTRY_MASK	((1<<MPX_BT_ENTRY_OFFSET)-1)
-#define MPX_GET_BD_ENTRY_OFFSET(addr)	((((addr)>>(MPX_BT_ENTRY_OFFSET+ \
-		MPX_IGN_BITS)) & MPX_BD_ENTRY_MASK) << MPX_BD_ENTRY_SHIFT)
-#define MPX_GET_BT_ENTRY_OFFSET(addr)	((((addr)>>MPX_IGN_BITS) & \
-		MPX_BT_ENTRY_MASK) << MPX_BT_ENTRY_SHIFT)
-
 #ifdef CONFIG_X86_INTEL_MPX
-siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
-				struct xsave_struct *xsave_buf);
-int mpx_handle_bd_fault(struct xsave_struct *xsave_buf);
+siginfo_t *mpx_generate_siginfo(struct pt_regs *regs);
+int mpx_handle_bd_fault(void);
 static inline int kernel_managing_mpx_tables(struct mm_struct *mm)
 {
 	return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR);
@@ -77,12 +72,11 @@ static inline void mpx_mm_init(struct mm_struct *mm)
 void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long start, unsigned long end);
 #else
-static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
-					      struct xsave_struct *xsave_buf)
+static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
 {
 	return NULL;
 }
-static inline int mpx_handle_bd_fault(struct xsave_struct *xsave_buf)
+static inline int mpx_handle_bd_fault(void)
 {
 	return -EINVAL;
 }
diff --git a/arch/x86/include/asm/msi.h b/arch/x86/include/asm/msi.h
new file mode 100644
index 000000000000..93724cc62177
--- /dev/null
+++ b/arch/x86/include/asm/msi.h
@@ -0,0 +1,7 @@
+#ifndef _ASM_X86_MSI_H
+#define _ASM_X86_MSI_H
+#include <asm/hw_irq.h>
+
+typedef struct irq_alloc_info msi_alloc_info_t;
+
+#endif /* _ASM_X86_MSI_H */
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index c469490db4a8..9ebc3d009373 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -56,6 +56,7 @@
 #define MSR_IA32_MCG_CAP		0x00000179
 #define MSR_IA32_MCG_STATUS		0x0000017a
 #define MSR_IA32_MCG_CTL		0x0000017b
+#define MSR_IA32_MCG_EXT_CTL		0x000004d0
 
 #define MSR_OFFCORE_RSP_0		0x000001a6
 #define MSR_OFFCORE_RSP_1		0x000001a7
@@ -140,6 +141,7 @@
 #define MSR_CORE_C3_RESIDENCY		0x000003fc
 #define MSR_CORE_C6_RESIDENCY		0x000003fd
 #define MSR_CORE_C7_RESIDENCY		0x000003fe
+#define MSR_KNL_CORE_C6_RESIDENCY	0x000003ff
 #define MSR_PKG_C2_RESIDENCY		0x0000060d
 #define MSR_PKG_C8_RESIDENCY		0x00000630
 #define MSR_PKG_C9_RESIDENCY		0x00000631
@@ -379,6 +381,7 @@
 #define FEATURE_CONTROL_LOCKED				(1<<0)
 #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX	(1<<1)
 #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX	(1<<2)
+#define FEATURE_CONTROL_LMCE				(1<<20)
 
 #define MSR_IA32_APICBASE		0x0000001b
 #define MSR_IA32_APICBASE_BSP		(1<<8)
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index de36f22eb0b9..e6a707eb5081 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -1,13 +1,14 @@
 #ifndef _ASM_X86_MSR_H
 #define _ASM_X86_MSR_H
 
-#include <uapi/asm/msr.h>
+#include "msr-index.h"
 
 #ifndef __ASSEMBLY__
 
 #include <asm/asm.h>
 #include <asm/errno.h>
 #include <asm/cpumask.h>
+#include <uapi/asm/msr.h>
 
 struct msr {
 	union {
@@ -205,8 +206,13 @@ do {                                                            \
 
 #endif	/* !CONFIG_PARAVIRT */
 
-#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val),		\
-					     (u32)((val) >> 32))
+/*
+ * 64-bit version of wrmsr_safe():
+ */
+static inline int wrmsrl_safe(u32 msr, u64 val)
+{
+	return wrmsr_safe(msr, (u32)val,  (u32)(val >> 32));
+}
 
 #define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high))
 
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index f768f6298419..b94f6f64e23d 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -31,7 +31,7 @@
  * arch_phys_wc_add and arch_phys_wc_del.
  */
 # ifdef CONFIG_MTRR
-extern u8 mtrr_type_lookup(u64 addr, u64 end);
+extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform);
 extern void mtrr_save_fixed_ranges(void *);
 extern void mtrr_save_state(void);
 extern int mtrr_add(unsigned long base, unsigned long size,
@@ -48,14 +48,13 @@ extern void mtrr_aps_init(void);
 extern void mtrr_bp_restore(void);
 extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
 extern int amd_special_default_mtrr(void);
-extern int phys_wc_to_mtrr_index(int handle);
 #  else
-static inline u8 mtrr_type_lookup(u64 addr, u64 end)
+static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform)
 {
 	/*
 	 * Return no-MTRRs:
 	 */
-	return 0xff;
+	return MTRR_TYPE_INVALID;
 }
 #define mtrr_save_fixed_ranges(arg) do {} while (0)
 #define mtrr_save_state() do {} while (0)
@@ -84,10 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
 static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
 {
 }
-static inline int phys_wc_to_mtrr_index(int handle)
-{
-	return -1;
-}
 
 #define mtrr_ap_init() do {} while (0)
 #define mtrr_bp_init() do {} while (0)
@@ -127,4 +122,8 @@ struct mtrr_gentry32 {
 				 _IOW(MTRR_IOCTL_BASE,  9, struct mtrr_sentry32)
 #endif /* CONFIG_COMPAT */
 
+/* Bit fields for enabled in struct mtrr_state_type */
+#define MTRR_STATE_MTRR_FIXED_ENABLED	0x01
+#define MTRR_STATE_MTRR_ENABLED		0x02
+
 #endif /* _ASM_X86_MTRR_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 8957810ad7d1..d143bfad45d7 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -712,6 +712,31 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
 
 #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
+							u32 val)
+{
+	PVOP_VCALL2(pv_lock_ops.queued_spin_lock_slowpath, lock, val);
+}
+
+static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
+{
+	PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock);
+}
+
+static __always_inline void pv_wait(u8 *ptr, u8 val)
+{
+	PVOP_VCALL2(pv_lock_ops.wait, ptr, val);
+}
+
+static __always_inline void pv_kick(int cpu)
+{
+	PVOP_VCALL1(pv_lock_ops.kick, cpu);
+}
+
+#else /* !CONFIG_QUEUED_SPINLOCKS */
+
 static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
 							__ticket_t ticket)
 {
@@ -724,7 +749,9 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
 	PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
 }
 
-#endif
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+
+#endif /* SMP && PARAVIRT_SPINLOCKS */
 
 #ifdef CONFIG_X86_32
 #define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index f7b0b5c112f2..a6b8f9fadb06 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -160,13 +160,14 @@ struct pv_cpu_ops {
 	u64 (*read_pmc)(int counter);
 	unsigned long long (*read_tscp)(unsigned int *aux);
 
+#ifdef CONFIG_X86_32
 	/*
 	 * Atomically enable interrupts and return to userspace.  This
-	 * is only ever used to return to 32-bit processes; in a
-	 * 64-bit kernel, it's used for 32-on-64 compat processes, but
-	 * never native 64-bit processes.  (Jump, not call.)
+	 * is only used in 32-bit kernels.  64-bit kernels use
+	 * usergs_sysret32 instead.
 	 */
 	void (*irq_enable_sysexit)(void);
+#endif
 
 	/*
 	 * Switch to usermode gs and return to 64-bit usermode using
@@ -333,9 +334,19 @@ struct arch_spinlock;
 typedef u16 __ticket_t;
 #endif
 
+struct qspinlock;
+
 struct pv_lock_ops {
+#ifdef CONFIG_QUEUED_SPINLOCKS
+	void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
+	struct paravirt_callee_save queued_spin_unlock;
+
+	void (*wait)(u8 *ptr, u8 val);
+	void (*kick)(int cpu);
+#else /* !CONFIG_QUEUED_SPINLOCKS */
 	struct paravirt_callee_save lock_spinning;
 	void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
 };
 
 /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index 91bc4ba95f91..ca6c228d5e62 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -4,14 +4,9 @@
 #include <linux/types.h>
 #include <asm/pgtable_types.h>
 
-#ifdef CONFIG_X86_PAT
-extern int pat_enabled;
-#else
-static const int pat_enabled;
-#endif
-
+bool pat_enabled(void);
 extern void pat_init(void);
-void pat_init_cache_modes(void);
+void pat_init_cache_modes(u64);
 
 extern int reserve_memtype(u64 start, u64 end,
 		enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 4e370a5d8117..b962e0fe5658 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -80,13 +80,6 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 
 #ifdef CONFIG_PCI
 extern void early_quirks(void);
-static inline void pci_dma_burst_advice(struct pci_dev *pdev,
-					enum pci_dma_burst_strategy *strat,
-					unsigned long *strategy_parameter)
-{
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
-}
 #else
 static inline void early_quirks(void) { }
 #endif
@@ -96,15 +89,10 @@ extern void pci_iommu_alloc(void);
 #ifdef CONFIG_PCI_MSI
 /* implemented in arch/x86/kernel/apic/io_apic. */
 struct msi_desc;
-void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq,
-			    unsigned int dest, struct msi_msg *msg, u8 hpet_id);
 int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 void native_teardown_msi_irq(unsigned int irq);
 void native_restore_msi_irqs(struct pci_dev *dev);
-int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
-		  unsigned int irq_base, unsigned int irq_offset);
 #else
-#define native_compose_msi_msg		NULL
 #define native_setup_msi_irqs		NULL
 #define native_teardown_msi_irq		NULL
 #endif
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index fe57e7a98839..2562e303405b 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -398,11 +398,17 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
 	 * requested memtype:
 	 * - request is uncached, return cannot be write-back
 	 * - request is write-combine, return cannot be write-back
+	 * - request is write-through, return cannot be write-back
+	 * - request is write-through, return cannot be write-combine
 	 */
 	if ((pcm == _PAGE_CACHE_MODE_UC_MINUS &&
 	     new_pcm == _PAGE_CACHE_MODE_WB) ||
 	    (pcm == _PAGE_CACHE_MODE_WC &&
-	     new_pcm == _PAGE_CACHE_MODE_WB)) {
+	     new_pcm == _PAGE_CACHE_MODE_WB) ||
+	    (pcm == _PAGE_CACHE_MODE_WT &&
+	     new_pcm == _PAGE_CACHE_MODE_WB) ||
+	    (pcm == _PAGE_CACHE_MODE_WT &&
+	     new_pcm == _PAGE_CACHE_MODE_WC)) {
 		return 0;
 	}
 
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 78f0c8cbe316..13f310bfc09a 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -367,6 +367,9 @@ extern int nx_enabled;
 #define pgprot_writecombine	pgprot_writecombine
 extern pgprot_t pgprot_writecombine(pgprot_t prot);
 
+#define pgprot_writethrough	pgprot_writethrough
+extern pgprot_t pgprot_writethrough(pgprot_t prot);
+
 /* Indicate that x86 has its own track and untrack pfn vma functions */
 #define __HAVE_PFNMAP_TRACKING
 
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 8f3271842533..dca71714f860 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -99,11 +99,9 @@ static __always_inline bool should_resched(void)
   extern asmlinkage void ___preempt_schedule(void);
 # define __preempt_schedule() asm ("call ___preempt_schedule")
   extern asmlinkage void preempt_schedule(void);
-# ifdef CONFIG_CONTEXT_TRACKING
-    extern asmlinkage void ___preempt_schedule_context(void);
-#   define __preempt_schedule_context() asm ("call ___preempt_schedule_context")
-    extern asmlinkage void preempt_schedule_context(void);
-# endif
+  extern asmlinkage void ___preempt_schedule_notrace(void);
+# define __preempt_schedule_notrace() asm ("call ___preempt_schedule_notrace")
+  extern asmlinkage void preempt_schedule_notrace(void);
 #endif
 
 #endif /* __ASM_PREEMPT_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 23ba6765b718..43e6519df0d5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -21,6 +21,7 @@ struct mm_struct;
 #include <asm/desc_defs.h>
 #include <asm/nops.h>
 #include <asm/special_insns.h>
+#include <asm/fpu/types.h>
 
 #include <linux/personality.h>
 #include <linux/cpumask.h>
@@ -52,11 +53,16 @@ static inline void *current_text_addr(void)
 	return pc;
 }
 
+/*
+ * These alignment constraints are for performance in the vSMP case,
+ * but in the task_struct case we must also meet hardware imposed
+ * alignment requirements of the FPU state:
+ */
 #ifdef CONFIG_X86_VSMP
 # define ARCH_MIN_TASKALIGN		(1 << INTERNODE_CACHE_SHIFT)
 # define ARCH_MIN_MMSTRUCT_ALIGN	(1 << INTERNODE_CACHE_SHIFT)
 #else
-# define ARCH_MIN_TASKALIGN		16
+# define ARCH_MIN_TASKALIGN		__alignof__(union fpregs_state)
 # define ARCH_MIN_MMSTRUCT_ALIGN	0
 #endif
 
@@ -166,7 +172,6 @@ extern const struct seq_operations cpuinfo_op;
 #define cache_line_size()	(boot_cpu_data.x86_cache_alignment)
 
 extern void cpu_detect(struct cpuinfo_x86 *c);
-extern void fpu_detect(struct cpuinfo_x86 *c);
 
 extern void early_cpu_init(void);
 extern void identify_boot_cpu(void);
@@ -313,128 +318,6 @@ struct orig_ist {
 	unsigned long		ist[7];
 };
 
-#define	MXCSR_DEFAULT		0x1f80
-
-struct i387_fsave_struct {
-	u32			cwd;	/* FPU Control Word		*/
-	u32			swd;	/* FPU Status Word		*/
-	u32			twd;	/* FPU Tag Word			*/
-	u32			fip;	/* FPU IP Offset		*/
-	u32			fcs;	/* FPU IP Selector		*/
-	u32			foo;	/* FPU Operand Pointer Offset	*/
-	u32			fos;	/* FPU Operand Pointer Selector	*/
-
-	/* 8*10 bytes for each FP-reg = 80 bytes:			*/
-	u32			st_space[20];
-
-	/* Software status information [not touched by FSAVE ]:		*/
-	u32			status;
-};
-
-struct i387_fxsave_struct {
-	u16			cwd; /* Control Word			*/
-	u16			swd; /* Status Word			*/
-	u16			twd; /* Tag Word			*/
-	u16			fop; /* Last Instruction Opcode		*/
-	union {
-		struct {
-			u64	rip; /* Instruction Pointer		*/
-			u64	rdp; /* Data Pointer			*/
-		};
-		struct {
-			u32	fip; /* FPU IP Offset			*/
-			u32	fcs; /* FPU IP Selector			*/
-			u32	foo; /* FPU Operand Offset		*/
-			u32	fos; /* FPU Operand Selector		*/
-		};
-	};
-	u32			mxcsr;		/* MXCSR Register State */
-	u32			mxcsr_mask;	/* MXCSR Mask		*/
-
-	/* 8*16 bytes for each FP-reg = 128 bytes:			*/
-	u32			st_space[32];
-
-	/* 16*16 bytes for each XMM-reg = 256 bytes:			*/
-	u32			xmm_space[64];
-
-	u32			padding[12];
-
-	union {
-		u32		padding1[12];
-		u32		sw_reserved[12];
-	};
-
-} __attribute__((aligned(16)));
-
-struct i387_soft_struct {
-	u32			cwd;
-	u32			swd;
-	u32			twd;
-	u32			fip;
-	u32			fcs;
-	u32			foo;
-	u32			fos;
-	/* 8*10 bytes for each FP-reg = 80 bytes: */
-	u32			st_space[20];
-	u8			ftop;
-	u8			changed;
-	u8			lookahead;
-	u8			no_update;
-	u8			rm;
-	u8			alimit;
-	struct math_emu_info	*info;
-	u32			entry_eip;
-};
-
-struct ymmh_struct {
-	/* 16 * 16 bytes for each YMMH-reg = 256 bytes */
-	u32 ymmh_space[64];
-};
-
-/* We don't support LWP yet: */
-struct lwp_struct {
-	u8 reserved[128];
-};
-
-struct bndreg {
-	u64 lower_bound;
-	u64 upper_bound;
-} __packed;
-
-struct bndcsr {
-	u64 bndcfgu;
-	u64 bndstatus;
-} __packed;
-
-struct xsave_hdr_struct {
-	u64 xstate_bv;
-	u64 xcomp_bv;
-	u64 reserved[6];
-} __attribute__((packed));
-
-struct xsave_struct {
-	struct i387_fxsave_struct i387;
-	struct xsave_hdr_struct xsave_hdr;
-	struct ymmh_struct ymmh;
-	struct lwp_struct lwp;
-	struct bndreg bndreg[4];
-	struct bndcsr bndcsr;
-	/* new processor state extensions will go here */
-} __attribute__ ((packed, aligned (64)));
-
-union thread_xstate {
-	struct i387_fsave_struct	fsave;
-	struct i387_fxsave_struct	fxsave;
-	struct i387_soft_struct		soft;
-	struct xsave_struct		xsave;
-};
-
-struct fpu {
-	unsigned int last_cpu;
-	unsigned int has_fpu;
-	union thread_xstate *state;
-};
-
 #ifdef CONFIG_X86_64
 DECLARE_PER_CPU(struct orig_ist, orig_ist);
 
@@ -483,8 +366,6 @@ DECLARE_PER_CPU(struct irq_stack *, softirq_stack);
 #endif	/* X86_64 */
 
 extern unsigned int xstate_size;
-extern void free_thread_xstate(struct task_struct *);
-extern struct kmem_cache *task_xstate_cachep;
 
 struct perf_event;
 
@@ -508,6 +389,10 @@ struct thread_struct {
 	unsigned long		fs;
 #endif
 	unsigned long		gs;
+
+	/* Floating point and extended processor state */
+	struct fpu		fpu;
+
 	/* Save middle states of ptrace breakpoints */
 	struct perf_event	*ptrace_bps[HBP_NUM];
 	/* Debug status used for traps, single steps, etc... */
@@ -518,8 +403,6 @@ struct thread_struct {
 	unsigned long		cr2;
 	unsigned long		trap_nr;
 	unsigned long		error_code;
-	/* floating point and extended processor state */
-	struct fpu		fpu;
 #ifdef CONFIG_X86_32
 	/* Virtual 86 mode info */
 	struct vm86_struct __user *vm86_info;
@@ -535,15 +418,6 @@ struct thread_struct {
 	unsigned long		iopl;
 	/* Max allowed port in the bitmap, in bytes: */
 	unsigned		io_bitmap_max;
-	/*
-	 * fpu_counter contains the number of consecutive context switches
-	 * that the FPU is used. If this is over a threshold, the lazy fpu
-	 * saving becomes unlazy to save the trap. This is an unsigned char
-	 * so that after 256 times the counter wraps and the behavior turns
-	 * lazy again; this to deal with bursty apps that only use FPU for
-	 * a short time
-	 */
-	unsigned char fpu_counter;
 };
 
 /*
@@ -928,24 +802,25 @@ extern int get_tsc_mode(unsigned long adr);
 extern int set_tsc_mode(unsigned int val);
 
 /* Register/unregister a process' MPX related resource */
-#define MPX_ENABLE_MANAGEMENT(tsk)	mpx_enable_management((tsk))
-#define MPX_DISABLE_MANAGEMENT(tsk)	mpx_disable_management((tsk))
+#define MPX_ENABLE_MANAGEMENT()	mpx_enable_management()
+#define MPX_DISABLE_MANAGEMENT()	mpx_disable_management()
 
 #ifdef CONFIG_X86_INTEL_MPX
-extern int mpx_enable_management(struct task_struct *tsk);
-extern int mpx_disable_management(struct task_struct *tsk);
+extern int mpx_enable_management(void);
+extern int mpx_disable_management(void);
 #else
-static inline int mpx_enable_management(struct task_struct *tsk)
+static inline int mpx_enable_management(void)
 {
 	return -EINVAL;
 }
-static inline int mpx_disable_management(struct task_struct *tsk)
+static inline int mpx_disable_management(void)
 {
 	return -EINVAL;
 }
 #endif /* CONFIG_X86_INTEL_MPX */
 
 extern u16 amd_get_nb_id(int cpu);
+extern u32 amd_get_nodes_per_socket(void);
 
 static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
 {
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index a90f8972dad5..a4a77286cb1d 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -5,12 +5,14 @@
 
 /* misc architecture specific prototypes */
 
-void system_call(void);
 void syscall_init(void);
 
-void ia32_syscall(void);
-void ia32_cstar_target(void);
-void ia32_sysenter_target(void);
+void entry_SYSCALL_64(void);
+void entry_SYSCALL_compat(void);
+void entry_INT80_32(void);
+void entry_INT80_compat(void);
+void entry_SYSENTER_32(void);
+void entry_SYSENTER_compat(void);
 
 void x86_configure_nx(void);
 void x86_report_nx(void);
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 19507ffa5d28..5fabf1362942 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -107,7 +107,7 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
 static inline int user_mode(struct pt_regs *regs)
 {
 #ifdef CONFIG_X86_32
-	return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
+	return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL;
 #else
 	return !!(regs->cs & 3);
 #endif
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 6167fd798188..655e07a48f6c 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -41,5 +41,6 @@ struct pvclock_wall_clock {
 
 #define PVCLOCK_TSC_STABLE_BIT	(1 << 0)
 #define PVCLOCK_GUEST_STOPPED	(1 << 1)
+#define PVCLOCK_COUNTS_FROM_ZERO (1 << 2)
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d6b078e9fa28..628954ceede1 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -86,7 +86,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 	offset = pvclock_get_nsec_offset(src);
 	ret = src->system_time + offset;
 	ret_flags = src->flags;
-	rdtsc_barrier();
 
 	*cycles = ret;
 	*flags = ret_flags;
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
new file mode 100644
index 000000000000..9d51fae1cba3
--- /dev/null
+++ b/arch/x86/include/asm/qspinlock.h
@@ -0,0 +1,57 @@
+#ifndef _ASM_X86_QSPINLOCK_H
+#define _ASM_X86_QSPINLOCK_H
+
+#include <asm/cpufeature.h>
+#include <asm-generic/qspinlock_types.h>
+#include <asm/paravirt.h>
+
+#define	queued_spin_unlock queued_spin_unlock
+/**
+ * queued_spin_unlock - release a queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ *
+ * A smp_store_release() on the least-significant byte.
+ */
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
+{
+	smp_store_release((u8 *)lock, 0);
+}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
+
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+	pv_queued_spin_lock_slowpath(lock, val);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+	pv_queued_spin_unlock(lock);
+}
+#else
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+	native_queued_spin_unlock(lock);
+}
+#endif
+
+#define virt_queued_spin_lock virt_queued_spin_lock
+
+static inline bool virt_queued_spin_lock(struct qspinlock *lock)
+{
+	if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+		return false;
+
+	while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0)
+		cpu_relax();
+
+	return true;
+}
+
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_X86_QSPINLOCK_H */
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
new file mode 100644
index 000000000000..b002e711ba88
--- /dev/null
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_QSPINLOCK_PARAVIRT_H
+#define __ASM_QSPINLOCK_PARAVIRT_H
+
+PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
+
+#endif
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 5a9856eb12ba..7d5a1929d76b 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -231,11 +231,21 @@
 #define TLS_SIZE			(GDT_ENTRY_TLS_ENTRIES* 8)
 
 #ifdef __KERNEL__
+
+/*
+ * early_idt_handler_array is an array of entry points referenced in the
+ * early IDT.  For simplicity, it's a real array with one entry point
+ * every nine bytes.  That leaves room for an optional 'push $0' if the
+ * vector has no error code (two bytes), a 'push $vector_number' (two
+ * bytes), and a jump to the common entry code (up to five bytes).
+ */
+#define EARLY_IDT_HANDLER_SIZE 9
+
 #ifndef __ASSEMBLY__
 
-extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5];
+extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];
 #ifdef CONFIG_TRACING
-# define trace_early_idt_handlers early_idt_handlers
+# define trace_early_idt_handler_array early_idt_handler_array
 #endif
 
 /*
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index f69e06b283fb..11af24e09c8a 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -60,17 +60,24 @@ static inline void x86_ce4100_early_setup(void) { }
 #ifndef _SETUP
 
 #include <asm/espfix.h>
+#include <linux/kernel.h>
 
 /*
  * This is set up by the setup-routine at boot-time
  */
 extern struct boot_params boot_params;
+extern char _text[];
 
 static inline bool kaslr_enabled(void)
 {
 	return !!(boot_params.hdr.loadflags & KASLR_FLAG);
 }
 
+static inline unsigned long kaslr_offset(void)
+{
+	return (unsigned long)&_text - __START_KERNEL;
+}
+
 /*
  * Do NOT EVER look at the BIOS memory size location.
  * It does not work on many machines.
diff --git a/arch/x86/include/asm/simd.h b/arch/x86/include/asm/simd.h
index ee80b92f0096..6c8a7ed13365 100644
--- a/arch/x86/include/asm/simd.h
+++ b/arch/x86/include/asm/simd.h
@@ -1,5 +1,5 @@
 
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 /*
  * may_use_simd - whether it is allowable at this time to issue SIMD
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 17a8dced12da..222a6a3ca2b5 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -37,16 +37,6 @@ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
 DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
 
-static inline struct cpumask *cpu_sibling_mask(int cpu)
-{
-	return per_cpu(cpu_sibling_map, cpu);
-}
-
-static inline struct cpumask *cpu_core_mask(int cpu)
-{
-	return per_cpu(cpu_core_map, cpu);
-}
-
 static inline struct cpumask *cpu_llc_shared_mask(int cpu)
 {
 	return per_cpu(cpu_llc_shared_map, cpu);
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index aeb4666e0c0a..2270e41b32fd 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -215,6 +215,44 @@ static inline void clwb(volatile void *__p)
 		: [pax] "a" (p));
 }
 
+/**
+ * pcommit_sfence() - persistent commit and fence
+ *
+ * The PCOMMIT instruction ensures that data that has been flushed from the
+ * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to
+ * memory and is durable on the DIMM.  The primary use case for this is
+ * persistent memory.
+ *
+ * This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT
+ * with appropriate fencing.
+ *
+ * Example:
+ * void flush_and_commit_buffer(void *vaddr, unsigned int size)
+ * {
+ *         unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+ *         void *vend = vaddr + size;
+ *         void *p;
+ *
+ *         for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+ *              p < vend; p += boot_cpu_data.x86_clflush_size)
+ *                 clwb(p);
+ *
+ *         // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes
+ *         // MFENCE via mb() also works
+ *         wmb();
+ *
+ *         // PCOMMIT and the required SFENCE for ordering
+ *         pcommit_sfence();
+ * }
+ *
+ * After this function completes the data pointed to by 'vaddr' has been
+ * accepted to memory and will be durable if the 'vaddr' points to persistent
+ * memory.
+ *
+ * PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify
+ * things we include both the PCOMMIT and the required SFENCE in the
+ * alternatives generated by pcommit_sfence().
+ */
 static inline void pcommit_sfence(void)
 {
 	alternative(ASM_NOP7,
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 64b611782ef0..be0a05913b91 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -42,6 +42,10 @@
 extern struct static_key paravirt_ticketlocks_enabled;
 static __always_inline bool static_key_false(struct static_key *key);
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#else
+
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 
 static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
@@ -196,6 +200,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 		cpu_relax();
 	}
 }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 
 /*
  * Read-write spinlocks, allowing multiple readers
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 5f9d7572d82b..65c3e37f879a 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -23,6 +23,9 @@ typedef u32 __ticketpair_t;
 
 #define TICKET_SHIFT	(sizeof(__ticket_t) * 8)
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#else
 typedef struct arch_spinlock {
 	union {
 		__ticketpair_t head_tail;
@@ -33,6 +36,7 @@ typedef struct arch_spinlock {
 } arch_spinlock_t;
 
 #define __ARCH_SPIN_LOCK_UNLOCKED	{ { 0 } }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 
 #include <asm-generic/qrwlock_types.h>
 
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 6a998598f172..c2e00bb2a136 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -39,7 +39,9 @@
 #include <asm/processor.h>
 #include <asm/percpu.h>
 #include <asm/desc.h>
+
 #include <linux/random.h>
+#include <linux/sched.h>
 
 /*
  * 24 byte read-only segment initializer for stack canary.  Linker
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index 552d6c90a6d4..d1793f06854d 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -7,7 +7,7 @@
 #define _ASM_X86_SUSPEND_32_H
 
 #include <asm/desc.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 /* image of the saved processor state */
 struct saved_context {
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index bc6232834bab..7ebf0ebe4e68 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -7,7 +7,7 @@
 #define _ASM_X86_SUSPEND_64_H
 
 #include <asm/desc.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 /*
  * Image of the saved processor state, used by the low level ACPI suspend to
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index b4bdec3e9523..225ee545e1a0 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -177,8 +177,6 @@ struct thread_info {
  */
 #ifndef __ASSEMBLY__
 
-DECLARE_PER_CPU(unsigned long, kernel_stack);
-
 static inline struct thread_info *current_thread_info(void)
 {
 	return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);
@@ -197,9 +195,13 @@ static inline unsigned long current_stack_pointer(void)
 
 #else /* !__ASSEMBLY__ */
 
+#ifdef CONFIG_X86_64
+# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
+#endif
+
 /* Load thread_info address into "reg" */
 #define GET_THREAD_INFO(reg) \
-	_ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \
+	_ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
 	_ASM_SUB $(THREAD_SIZE),reg ;
 
 /*
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 0e8f04f2c26f..0fb46482dfde 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -26,7 +26,7 @@
 #define _ASM_X86_TOPOLOGY_H
 
 #ifdef CONFIG_X86_32
-# ifdef CONFIG_X86_HT
+# ifdef CONFIG_SMP
 #  define ENABLE_TOPO_DEFINES
 # endif
 #else
@@ -124,7 +124,7 @@ extern const struct cpumask *cpu_coregroup_mask(int cpu);
 
 #ifdef ENABLE_TOPO_DEFINES
 #define topology_core_cpumask(cpu)		(per_cpu(cpu_core_map, cpu))
-#define topology_thread_cpumask(cpu)		(per_cpu(cpu_sibling_map, cpu))
+#define topology_sibling_cpumask(cpu)		(per_cpu(cpu_sibling_map, cpu))
 #endif
 
 static inline void arch_fix_phys_package_id(int num, u32 slot)
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
index 4cab890007a7..38a09a13a9bc 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -101,6 +101,12 @@ DEFINE_IRQ_VECTOR_EVENT(call_function_single);
 DEFINE_IRQ_VECTOR_EVENT(threshold_apic);
 
 /*
+ * deferred_error_apic - called when entering/exiting a deferred apic interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic);
+
+/*
  * thermal_apic - called when entering/exiting a thermal apic interrupt
  * vector handler
  */
diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h
new file mode 100644
index 000000000000..173dd3ba108c
--- /dev/null
+++ b/arch/x86/include/asm/trace/mpx.h
@@ -0,0 +1,132 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mpx
+
+#if !defined(_TRACE_MPX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MPX_H
+
+#include <linux/tracepoint.h>
+
+#ifdef CONFIG_X86_INTEL_MPX
+
+TRACE_EVENT(mpx_bounds_register_exception,
+
+	TP_PROTO(void *addr_referenced,
+		 const struct bndreg *bndreg),
+	TP_ARGS(addr_referenced, bndreg),
+
+	TP_STRUCT__entry(
+		__field(void *, addr_referenced)
+		__field(u64, lower_bound)
+		__field(u64, upper_bound)
+	),
+
+	TP_fast_assign(
+		__entry->addr_referenced = addr_referenced;
+		__entry->lower_bound = bndreg->lower_bound;
+		__entry->upper_bound = bndreg->upper_bound;
+	),
+	/*
+	 * Note that we are printing out the '~' of the upper
+	 * bounds register here.  It is actually stored in its
+	 * one's complement form so that its 'init' state
+	 * corresponds to all 0's.  But, that looks like
+	 * gibberish when printed out, so print out the 1's
+	 * complement instead of the actual value here.  Note
+	 * though that you still need to specify filters for the
+	 * actual value, not the displayed one.
+	 */
+	TP_printk("address referenced: 0x%p bounds: lower: 0x%llx ~upper: 0x%llx",
+		__entry->addr_referenced,
+		__entry->lower_bound,
+		~__entry->upper_bound
+	)
+);
+
+TRACE_EVENT(bounds_exception_mpx,
+
+	TP_PROTO(const struct bndcsr *bndcsr),
+	TP_ARGS(bndcsr),
+
+	TP_STRUCT__entry(
+		__field(u64, bndcfgu)
+		__field(u64, bndstatus)
+	),
+
+	TP_fast_assign(
+		/* need to get rid of the 'const' on bndcsr */
+		__entry->bndcfgu   = (u64)bndcsr->bndcfgu;
+		__entry->bndstatus = (u64)bndcsr->bndstatus;
+	),
+
+	TP_printk("bndcfgu:0x%llx bndstatus:0x%llx",
+		__entry->bndcfgu,
+		__entry->bndstatus)
+);
+
+DECLARE_EVENT_CLASS(mpx_range_trace,
+
+	TP_PROTO(unsigned long start,
+		 unsigned long end),
+	TP_ARGS(start, end),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, start)
+		__field(unsigned long, end)
+	),
+
+	TP_fast_assign(
+		__entry->start = start;
+		__entry->end   = end;
+	),
+
+	TP_printk("[0x%p:0x%p]",
+		(void *)__entry->start,
+		(void *)__entry->end
+	)
+);
+
+DEFINE_EVENT(mpx_range_trace, mpx_unmap_zap,
+	TP_PROTO(unsigned long start, unsigned long end),
+	TP_ARGS(start, end)
+);
+
+DEFINE_EVENT(mpx_range_trace, mpx_unmap_search,
+	TP_PROTO(unsigned long start, unsigned long end),
+	TP_ARGS(start, end)
+);
+
+TRACE_EVENT(mpx_new_bounds_table,
+
+	TP_PROTO(unsigned long table_vaddr),
+	TP_ARGS(table_vaddr),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, table_vaddr)
+	),
+
+	TP_fast_assign(
+		__entry->table_vaddr = table_vaddr;
+	),
+
+	TP_printk("table vaddr:%p", (void *)__entry->table_vaddr)
+);
+
+#else
+
+/*
+ * This gets used outside of MPX-specific code, so we need a stub.
+ */
+static inline void trace_bounds_exception_mpx(const struct bndcsr *bndcsr)
+{
+}
+
+#endif /* CONFIG_X86_INTEL_MPX */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH asm/trace/
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE mpx
+#endif /* _TRACE_MPX_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 4e49d7dff78e..c5380bea2a36 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -108,7 +108,8 @@ extern int panic_on_unrecovered_nmi;
 void math_emulate(struct math_emu_info *);
 #ifndef CONFIG_X86_32
 asmlinkage void smp_thermal_interrupt(void);
-asmlinkage void mce_threshold_interrupt(void);
+asmlinkage void smp_threshold_interrupt(void);
+asmlinkage void smp_deferred_error_interrupt(void);
 #endif
 
 extern enum ctx_state ist_enter(struct pt_regs *regs);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index ace9dec050b1..a8df874f3e88 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -74,7 +74,8 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
  * @addr: User space pointer to start of block to check
  * @size: Size of block to check
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Checks if a pointer to a block of memory in user space is valid.
  *
@@ -145,7 +146,8 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -240,7 +242,8 @@ extern void __put_user_8(void);
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
@@ -455,7 +458,8 @@ struct __large_struct { unsigned long buf[100]; };
  * @x:   Variable to store result.
  * @ptr: Source address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple variable from user space to kernel
  * space.  It supports simple types like char and int, but not larger
@@ -479,7 +483,8 @@ struct __large_struct { unsigned long buf[100]; };
  * @x:   Value to copy to user space.
  * @ptr: Destination address, in user space.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * This macro copies a single simple value from kernel space to user
  * space.  It supports simple types like char and int, but not larger
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 3c03a5de64d3..f5dcb5204dcd 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -59,6 +59,10 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
 			__put_user_size(*(u32 *)from, (u32 __user *)to,
 					4, ret, 4);
 			return ret;
+		case 8:
+			__put_user_size(*(u64 *)from, (u64 __user *)to,
+					8, ret, 8);
+			return ret;
 		}
 	}
 	return __copy_to_user_ll(to, from, n);
@@ -70,7 +74,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
  * @from: Source address, in kernel space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.  Caller must check
  * the specified block with access_ok() before calling this function.
@@ -117,7 +122,8 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
  * @from: Source address, in user space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.  Caller must check
  * the specified block with access_ok() before calling this function.
diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h
index ccab4af1646d..59a54e869f15 100644
--- a/arch/x86/include/asm/user.h
+++ b/arch/x86/include/asm/user.h
@@ -14,8 +14,8 @@ struct user_ymmh_regs {
 	__u32 ymmh_space[64];
 };
 
-struct user_xsave_hdr {
-	__u64 xstate_bv;
+struct user_xstate_header {
+	__u64 xfeatures;
 	__u64 reserved1[2];
 	__u64 reserved2[5];
 };
@@ -41,11 +41,11 @@ struct user_xsave_hdr {
  * particular process/thread.
  *
  * Also when the user modifies certain state FP/SSE/etc through the
- * ptrace interface, they must ensure that the xsave_hdr.xstate_bv
+ * ptrace interface, they must ensure that the header.xfeatures
  * bytes[512..519] of the memory layout are updated correspondingly.
  * i.e., for example when FP state is modified to a non-init state,
- * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to
- * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc.
+ * header.xfeatures's bit 0 must be set to '1', when SSE is modified to
+ * non-init state, header.xfeatures's bit 1 must to be set to '1', etc.
  */
 #define USER_XSTATE_FX_SW_WORDS 6
 #define USER_XSTATE_XCR0_WORD	0
@@ -55,7 +55,7 @@ struct user_xstateregs {
 		__u64 fpx_space[58];
 		__u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS];
 	} i387;
-	struct user_xsave_hdr xsave_hdr;
+	struct user_xstate_header header;
 	struct user_ymmh_regs ymmh;
 	/* further processor state extensions go here */
 };
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index f58a9c7a3c86..48d34d28f5a6 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -171,38 +171,17 @@ struct x86_platform_ops {
 };
 
 struct pci_dev;
-struct msi_msg;
 
 struct x86_msi_ops {
 	int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
-	void (*compose_msi_msg)(struct pci_dev *dev, unsigned int irq,
-				unsigned int dest, struct msi_msg *msg,
-			       u8 hpet_id);
 	void (*teardown_msi_irq)(unsigned int irq);
 	void (*teardown_msi_irqs)(struct pci_dev *dev);
 	void (*restore_msi_irqs)(struct pci_dev *dev);
-	int  (*setup_hpet_msi)(unsigned int irq, unsigned int id);
 };
 
-struct IO_APIC_route_entry;
-struct io_apic_irq_attr;
-struct irq_data;
-struct cpumask;
-
 struct x86_io_apic_ops {
-	void		(*init)   (void);
 	unsigned int	(*read)   (unsigned int apic, unsigned int reg);
-	void		(*write)  (unsigned int apic, unsigned int reg, unsigned int value);
-	void		(*modify) (unsigned int apic, unsigned int reg, unsigned int value);
 	void		(*disable)(void);
-	void		(*print_entries)(unsigned int apic, unsigned int nr_entries);
-	int		(*set_affinity)(struct irq_data *data,
-					const struct cpumask *mask,
-					bool force);
-	int		(*setup_entry)(int irq, struct IO_APIC_route_entry *entry,
-				       unsigned int destination, int vector,
-				       struct io_apic_irq_attr *attr);
-	void		(*eoi_ioapic_pin)(int apic, int pin, int vector);
 };
 
 extern struct x86_init_ops x86_init;
diff --git a/arch/x86/include/asm/xcr.h b/arch/x86/include/asm/xcr.h
deleted file mode 100644
index f2cba4e79a23..000000000000
--- a/arch/x86/include/asm/xcr.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *
- *   Copyright 2008 rPath, Inc. - All Rights Reserved
- *
- *   This file is part of the Linux kernel, and is made available under
- *   the terms of the GNU General Public License version 2 or (at your
- *   option) any later version; incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
-
-/*
- * asm-x86/xcr.h
- *
- * Definitions for the eXtended Control Register instructions
- */
-
-#ifndef _ASM_X86_XCR_H
-#define _ASM_X86_XCR_H
-
-#define XCR_XFEATURE_ENABLED_MASK	0x00000000
-
-#ifdef __KERNEL__
-# ifndef __ASSEMBLY__
-
-#include <linux/types.h>
-
-static inline u64 xgetbv(u32 index)
-{
-	u32 eax, edx;
-
-	asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
-		     : "=a" (eax), "=d" (edx)
-		     : "c" (index));
-	return eax + ((u64)edx << 32);
-}
-
-static inline void xsetbv(u32 index, u64 value)
-{
-	u32 eax = value;
-	u32 edx = value >> 32;
-
-	asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
-		     : : "a" (eax), "d" (edx), "c" (index));
-}
-
-# endif /* __ASSEMBLY__ */
-#endif /* __KERNEL__ */
-
-#endif /* _ASM_X86_XCR_H */
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index d8829751b3f8..1f5c5161ead6 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -36,7 +36,7 @@
  * no advantages to be gotten from x86-64 here anyways.
  */
 
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 #ifdef CONFIG_X86_32
 /* reduce register pressure */
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index ce05722e3c68..5a08bc8bff33 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -26,7 +26,7 @@
 #define XO3(x, y)	"       pxor   8*("#x")(%4), %%mm"#y"   ;\n"
 #define XO4(x, y)	"       pxor   8*("#x")(%5), %%mm"#y"   ;\n"
 
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 static void
 xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h
index 492b29802f57..7c0a517ec751 100644
--- a/arch/x86/include/asm/xor_avx.h
+++ b/arch/x86/include/asm/xor_avx.h
@@ -18,7 +18,7 @@
 #ifdef CONFIG_AS_AVX
 
 #include <linux/compiler.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 
 #define BLOCK4(i) \
 		BLOCK(32 * i, 0) \
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
deleted file mode 100644
index c9a6d68b8d62..000000000000
--- a/arch/x86/include/asm/xsave.h
+++ /dev/null
@@ -1,257 +0,0 @@
-#ifndef __ASM_X86_XSAVE_H
-#define __ASM_X86_XSAVE_H
-
-#include <linux/types.h>
-#include <asm/processor.h>
-
-#define XSTATE_CPUID		0x0000000d
-
-#define XSTATE_FP		0x1
-#define XSTATE_SSE		0x2
-#define XSTATE_YMM		0x4
-#define XSTATE_BNDREGS		0x8
-#define XSTATE_BNDCSR		0x10
-#define XSTATE_OPMASK		0x20
-#define XSTATE_ZMM_Hi256	0x40
-#define XSTATE_Hi16_ZMM		0x80
-
-#define XSTATE_FPSSE	(XSTATE_FP | XSTATE_SSE)
-#define XSTATE_AVX512	(XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
-/* Bit 63 of XCR0 is reserved for future expansion */
-#define XSTATE_EXTEND_MASK	(~(XSTATE_FPSSE | (1ULL << 63)))
-
-#define FXSAVE_SIZE	512
-
-#define XSAVE_HDR_SIZE	    64
-#define XSAVE_HDR_OFFSET    FXSAVE_SIZE
-
-#define XSAVE_YMM_SIZE	    256
-#define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
-
-/* Supported features which support lazy state saving */
-#define XSTATE_LAZY	(XSTATE_FP | XSTATE_SSE | XSTATE_YMM		      \
-			| XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
-
-/* Supported features which require eager state saving */
-#define XSTATE_EAGER	(XSTATE_BNDREGS | XSTATE_BNDCSR)
-
-/* All currently supported features */
-#define XCNTXT_MASK	(XSTATE_LAZY | XSTATE_EAGER)
-
-#ifdef CONFIG_X86_64
-#define REX_PREFIX	"0x48, "
-#else
-#define REX_PREFIX
-#endif
-
-extern unsigned int xstate_size;
-extern u64 pcntxt_mask;
-extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
-extern struct xsave_struct *init_xstate_buf;
-
-extern void xsave_init(void);
-extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
-extern int init_fpu(struct task_struct *child);
-
-/* These macros all use (%edi)/(%rdi) as the single memory argument. */
-#define XSAVE		".byte " REX_PREFIX "0x0f,0xae,0x27"
-#define XSAVEOPT	".byte " REX_PREFIX "0x0f,0xae,0x37"
-#define XSAVES		".byte " REX_PREFIX "0x0f,0xc7,0x2f"
-#define XRSTOR		".byte " REX_PREFIX "0x0f,0xae,0x2f"
-#define XRSTORS		".byte " REX_PREFIX "0x0f,0xc7,0x1f"
-
-#define xstate_fault	".section .fixup,\"ax\"\n"	\
-			"3:  movl $-1,%[err]\n"		\
-			"    jmp  2b\n"			\
-			".previous\n"			\
-			_ASM_EXTABLE(1b, 3b)		\
-			: [err] "=r" (err)
-
-/*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
- */
-static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask)
-{
-	u32 lmask = mask;
-	u32 hmask = mask >> 32;
-	int err = 0;
-
-	WARN_ON(system_state != SYSTEM_BOOTING);
-
-	if (boot_cpu_has(X86_FEATURE_XSAVES))
-		asm volatile("1:"XSAVES"\n\t"
-			"2:\n\t"
-			     xstate_fault
-			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-			:   "memory");
-	else
-		asm volatile("1:"XSAVE"\n\t"
-			"2:\n\t"
-			     xstate_fault
-			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-			:   "memory");
-	return err;
-}
-
-/*
- * This function is called only during boot time when x86 caps are not set
- * up and alternative can not be used yet.
- */
-static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask)
-{
-	u32 lmask = mask;
-	u32 hmask = mask >> 32;
-	int err = 0;
-
-	WARN_ON(system_state != SYSTEM_BOOTING);
-
-	if (boot_cpu_has(X86_FEATURE_XSAVES))
-		asm volatile("1:"XRSTORS"\n\t"
-			"2:\n\t"
-			     xstate_fault
-			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-			:   "memory");
-	else
-		asm volatile("1:"XRSTOR"\n\t"
-			"2:\n\t"
-			     xstate_fault
-			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-			:   "memory");
-	return err;
-}
-
-/*
- * Save processor xstate to xsave area.
- */
-static inline int xsave_state(struct xsave_struct *fx, u64 mask)
-{
-	u32 lmask = mask;
-	u32 hmask = mask >> 32;
-	int err = 0;
-
-	/*
-	 * If xsaves is enabled, xsaves replaces xsaveopt because
-	 * it supports compact format and supervisor states in addition to
-	 * modified optimization in xsaveopt.
-	 *
-	 * Otherwise, if xsaveopt is enabled, xsaveopt replaces xsave
-	 * because xsaveopt supports modified optimization which is not
-	 * supported by xsave.
-	 *
-	 * If none of xsaves and xsaveopt is enabled, use xsave.
-	 */
-	alternative_input_2(
-		"1:"XSAVE,
-		XSAVEOPT,
-		X86_FEATURE_XSAVEOPT,
-		XSAVES,
-		X86_FEATURE_XSAVES,
-		[fx] "D" (fx), "a" (lmask), "d" (hmask) :
-		"memory");
-	asm volatile("2:\n\t"
-		     xstate_fault
-		     : "0" (0)
-		     : "memory");
-
-	return err;
-}
-
-/*
- * Restore processor xstate from xsave area.
- */
-static inline int xrstor_state(struct xsave_struct *fx, u64 mask)
-{
-	int err = 0;
-	u32 lmask = mask;
-	u32 hmask = mask >> 32;
-
-	/*
-	 * Use xrstors to restore context if it is enabled. xrstors supports
-	 * compacted format of xsave area which is not supported by xrstor.
-	 */
-	alternative_input(
-		"1: " XRSTOR,
-		XRSTORS,
-		X86_FEATURE_XSAVES,
-		"D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
-		: "memory");
-
-	asm volatile("2:\n"
-		     xstate_fault
-		     : "0" (0)
-		     : "memory");
-
-	return err;
-}
-
-/*
- * Save xstate context for old process during context switch.
- */
-static inline void fpu_xsave(struct fpu *fpu)
-{
-	xsave_state(&fpu->state->xsave, -1);
-}
-
-/*
- * Restore xstate context for new process during context switch.
- */
-static inline int fpu_xrstor_checking(struct xsave_struct *fx)
-{
-	return xrstor_state(fx, -1);
-}
-
-/*
- * Save xstate to user space xsave area.
- *
- * We don't use modified optimization because xrstor/xrstors might track
- * a different application.
- *
- * We don't use compacted format xsave area for
- * backward compatibility for old applications which don't understand
- * compacted format of xsave area.
- */
-static inline int xsave_user(struct xsave_struct __user *buf)
-{
-	int err;
-
-	/*
-	 * Clear the xsave header first, so that reserved fields are
-	 * initialized to zero.
-	 */
-	err = __clear_user(&buf->xsave_hdr, sizeof(buf->xsave_hdr));
-	if (unlikely(err))
-		return -EFAULT;
-
-	__asm__ __volatile__(ASM_STAC "\n"
-			     "1:"XSAVE"\n"
-			     "2: " ASM_CLAC "\n"
-			     xstate_fault
-			     : "D" (buf), "a" (-1), "d" (-1), "0" (0)
-			     : "memory");
-	return err;
-}
-
-/*
- * Restore xstate from user space xsave area.
- */
-static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
-{
-	int err = 0;
-	struct xsave_struct *xstate = ((__force struct xsave_struct *)buf);
-	u32 lmask = mask;
-	u32 hmask = mask >> 32;
-
-	__asm__ __volatile__(ASM_STAC "\n"
-			     "1:"XRSTOR"\n"
-			     "2: " ASM_CLAC "\n"
-			     xstate_fault
-			     : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
-			     : "memory");	/* memory required? */
-	return err;
-}
-
-void *get_xsave_addr(struct xsave_struct *xsave, int xstate);
-void setup_xstate_comp(void);
-
-#endif
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index d7dcef58aefa..a4ae82eb82aa 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -106,6 +106,8 @@ struct kvm_ioapic_state {
 #define KVM_IRQCHIP_IOAPIC       2
 #define KVM_NR_IRQCHIPS          3
 
+#define KVM_RUN_X86_SMM		 (1 << 0)
+
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
 	/* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
@@ -281,6 +283,7 @@ struct kvm_reinject_control {
 #define KVM_VCPUEVENT_VALID_NMI_PENDING	0x00000001
 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR	0x00000002
 #define KVM_VCPUEVENT_VALID_SHADOW	0x00000004
+#define KVM_VCPUEVENT_VALID_SMM		0x00000008
 
 /* Interrupt shadow states */
 #define KVM_X86_SHADOW_INT_MOV_SS	0x01
@@ -309,7 +312,13 @@ struct kvm_vcpu_events {
 	} nmi;
 	__u32 sipi_vector;
 	__u32 flags;
-	__u32 reserved[10];
+	struct {
+		__u8 smm;
+		__u8 pending;
+		__u8 smm_inside_nmi;
+		__u8 latched_init;
+	} smi;
+	__u32 reserved[9];
 };
 
 /* for KVM_GET/SET_DEBUGREGS */
@@ -345,4 +354,7 @@ struct kvm_xcrs {
 struct kvm_sync_regs {
 };
 
+#define KVM_QUIRK_LINT0_REENABLED	(1 << 0)
+#define KVM_QUIRK_CD_NW_CLEARED		(1 << 1)
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/uapi/asm/msr.h b/arch/x86/include/uapi/asm/msr.h
index 155e51048fa4..c41f4fe25483 100644
--- a/arch/x86/include/uapi/asm/msr.h
+++ b/arch/x86/include/uapi/asm/msr.h
@@ -1,8 +1,6 @@
 #ifndef _UAPI_ASM_X86_MSR_H
 #define _UAPI_ASM_X86_MSR_H
 
-#include <asm/msr-index.h>
-
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
diff --git a/arch/x86/include/uapi/asm/mtrr.h b/arch/x86/include/uapi/asm/mtrr.h
index d0acb658c8f4..7528dcf59691 100644
--- a/arch/x86/include/uapi/asm/mtrr.h
+++ b/arch/x86/include/uapi/asm/mtrr.h
@@ -103,7 +103,7 @@ struct mtrr_state_type {
 #define MTRRIOC_GET_PAGE_ENTRY   _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry)
 #define MTRRIOC_KILL_PAGE_ENTRY  _IOW(MTRR_IOCTL_BASE,  9, struct mtrr_sentry)
 
-/*  These are the region types  */
+/* MTRR memory types, which are defined in SDM */
 #define MTRR_TYPE_UNCACHABLE 0
 #define MTRR_TYPE_WRCOMB     1
 /*#define MTRR_TYPE_         2*/
@@ -113,5 +113,11 @@ struct mtrr_state_type {
 #define MTRR_TYPE_WRBACK     6
 #define MTRR_NUM_TYPES       7
 
+/*
+ * Invalid MTRR memory type.  mtrr_type_lookup() returns this value when
+ * MTRRs are disabled.  Note, this value is allocated from the reserved
+ * values (0x7-0xff) of the MTRR memory types.
+ */
+#define MTRR_TYPE_INVALID    0xff
 
 #endif /* _UAPI_ASM_X86_MTRR_H */
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index 16dc4e8a2cd3..0e8a973de9ee 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -25,7 +25,7 @@ struct _fpx_sw_bytes {
 	__u32 extended_size;	/* total size of the layout referred by
 				 * fpstate pointer in the sigcontext.
 				 */
-	__u64 xstate_bv;
+	__u64 xfeatures;
 				/* feature bit mask (including fp/sse/extended
 				 * state) that is present in the memory
 				 * layout.
@@ -209,8 +209,8 @@ struct sigcontext {
 
 #endif /* !__i386__ */
 
-struct _xsave_hdr {
-	__u64 xstate_bv;
+struct _header {
+	__u64 xfeatures;
 	__u64 reserved1[2];
 	__u64 reserved2[5];
 };
@@ -228,7 +228,7 @@ struct _ymmh_state {
  */
 struct _xstate {
 	struct _fpstate fpstate;
-	struct _xsave_hdr xstate_hdr;
+	struct _header xstate_hdr;
 	struct _ymmh_state ymmh;
 	/* new processor state extensions go here */
 };
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9bcd0b56ca17..0f15af41bd80 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -22,7 +22,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
 
 CFLAGS_irq.o := -I$(src)/../include/asm/trace
 
-obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o
+obj-y			:= process_$(BITS).o signal.o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time.o ioport.o ldt.o dumpstack.o nmi.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
@@ -31,9 +31,6 @@ obj-y			+= probe_roms.o
 obj-$(CONFIG_X86_32)	+= i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= mcount_64.o
-obj-y			+= syscall_$(BITS).o vsyscall_gtod.o
-obj-$(CONFIG_IA32_EMULATION)	+= syscall_32.o
-obj-$(CONFIG_X86_VSYSCALL_EMULATION)	+= vsyscall_64.o vsyscall_emu_64.o
 obj-$(CONFIG_X86_ESPFIX64)	+= espfix_64.o
 obj-$(CONFIG_SYSFS)	+= ksysfs.o
 obj-y			+= bootflag.o e820.o
@@ -44,7 +41,7 @@ obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
 
 obj-y				+= process.o
-obj-y				+= i387.o xsave.o
+obj-y				+= fpu/
 obj-y				+= ptrace.o
 obj-$(CONFIG_X86_32)		+= tls.o
 obj-$(CONFIG_IA32_EMULATION)	+= tls.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index dbe76a14c3c9..e49ee24da85e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -31,12 +31,12 @@
 #include <linux/module.h>
 #include <linux/dmi.h>
 #include <linux/irq.h>
-#include <linux/irqdomain.h>
 #include <linux/slab.h>
 #include <linux/bootmem.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>
 
+#include <asm/irqdomain.h>
 #include <asm/pci_x86.h>
 #include <asm/pgtable.h>
 #include <asm/io_apic.h>
@@ -400,57 +400,13 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
 	return 0;
 }
 
-static int mp_register_gsi(struct device *dev, u32 gsi, int trigger,
-			   int polarity)
-{
-	int irq, node;
-
-	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
-		return gsi;
-
-	trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
-	polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
-	node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
-	if (mp_set_gsi_attr(gsi, trigger, polarity, node)) {
-		pr_warn("Failed to set pin attr for GSI%d\n", gsi);
-		return -1;
-	}
-
-	irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC);
-	if (irq < 0)
-		return irq;
-
-	/* Don't set up the ACPI SCI because it's already set up */
-	if (enable_update_mptable && acpi_gbl_FADT.sci_interrupt != gsi)
-		mp_config_acpi_gsi(dev, gsi, trigger, polarity);
-
-	return irq;
-}
-
-static void mp_unregister_gsi(u32 gsi)
-{
-	int irq;
-
-	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
-		return;
-
-	irq = mp_map_gsi_to_irq(gsi, 0);
-	if (irq > 0)
-		mp_unmap_irq(irq);
-}
-
-static struct irq_domain_ops acpi_irqdomain_ops = {
-	.map = mp_irqdomain_map,
-	.unmap = mp_irqdomain_unmap,
-};
-
 static int __init
 acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 {
 	struct acpi_madt_io_apic *ioapic = NULL;
 	struct ioapic_domain_cfg cfg = {
 		.type = IOAPIC_DOMAIN_DYNAMIC,
-		.ops = &acpi_irqdomain_ops,
+		.ops = &mp_ioapic_irqdomain_ops,
 	};
 
 	ioapic = (struct acpi_madt_io_apic *)header;
@@ -652,7 +608,7 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
 	 * Make sure all (legacy) PCI IRQs are set as level-triggered.
 	 */
 	if (trigger == ACPI_LEVEL_SENSITIVE)
-		eisa_set_level_irq(gsi);
+		elcr_set_level_irq(gsi);
 #endif
 
 	return gsi;
@@ -663,10 +619,21 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
 				    int trigger, int polarity)
 {
 	int irq = gsi;
-
 #ifdef CONFIG_X86_IO_APIC
+	int node;
+	struct irq_alloc_info info;
+
+	node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+	trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
+	polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
+	ioapic_set_alloc_attr(&info, node, trigger, polarity);
+
 	mutex_lock(&acpi_ioapic_lock);
-	irq = mp_register_gsi(dev, gsi, trigger, polarity);
+	irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
+	/* Don't set up the ACPI SCI because it's already set up */
+	if (irq >= 0 && enable_update_mptable &&
+	    acpi_gbl_FADT.sci_interrupt != gsi)
+		mp_config_acpi_gsi(dev, gsi, trigger, polarity);
 	mutex_unlock(&acpi_ioapic_lock);
 #endif
 
@@ -676,8 +643,12 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
 static void acpi_unregister_gsi_ioapic(u32 gsi)
 {
 #ifdef CONFIG_X86_IO_APIC
+	int irq;
+
 	mutex_lock(&acpi_ioapic_lock);
-	mp_unregister_gsi(gsi);
+	irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+	if (irq > 0)
+		mp_unmap_irq(irq);
 	mutex_unlock(&acpi_ioapic_lock);
 #endif
 }
@@ -786,7 +757,7 @@ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
 	u64 addr;
 	struct ioapic_domain_cfg cfg = {
 		.type = IOAPIC_DOMAIN_DYNAMIC,
-		.ops = &acpi_irqdomain_ops,
+		.ops = &mp_ioapic_irqdomain_ops,
 	};
 
 	ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr);
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 665c6b7d2ea9..0c26b1b44e51 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -12,11 +12,13 @@ ENTRY(wakeup_pmode_return)
 wakeup_pmode_return:
 	movw	$__KERNEL_DS, %ax
 	movw	%ax, %ss
-	movw	%ax, %ds
-	movw	%ax, %es
 	movw	%ax, %fs
 	movw	%ax, %gs
 
+	movw	$__USER_DS, %ax
+	movw	%ax, %ds
+	movw	%ax, %es
+
 	# reload the gdt, as we need the full 32 bit address
 	lidt	saved_idt
 	lldt	saved_ldt
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index ae693b51ed8e..8c35df468104 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -62,7 +62,7 @@ ENTRY(do_suspend_lowlevel)
 	pushfq
 	popq	pt_regs_flags(%rax)
 
-	movq	$resume_point, saved_rip(%rip)
+	movq	$.Lresume_point, saved_rip(%rip)
 
 	movq	%rsp, saved_rsp
 	movq	%rbp, saved_rbp
@@ -75,10 +75,10 @@ ENTRY(do_suspend_lowlevel)
 	xorl	%eax, %eax
 	call	x86_acpi_enter_sleep_state
 	/* in case something went wrong, restore the machine status and go on */
-	jmp	resume_point
+	jmp	.Lresume_point
 
 	.align 4
-resume_point:
+.Lresume_point:
 	/* We don't restore %rax, it must be 0 anyway */
 	movq	$saved_context, %rax
 	movq	saved_context_cr4(%rax), %rbx
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index aef653193160..c42827eb86cf 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -21,6 +21,10 @@
 #include <asm/io.h>
 #include <asm/fixmap.h>
 
+int __read_mostly alternatives_patched;
+
+EXPORT_SYMBOL_GPL(alternatives_patched);
+
 #define MAX_PATCH_LEN (255-1)
 
 static int __initdata_or_module debug_alternative;
@@ -227,6 +231,15 @@ void __init arch_init_ideal_nops(void)
 #endif
 		}
 		break;
+
+	case X86_VENDOR_AMD:
+		if (boot_cpu_data.x86 > 0xf) {
+			ideal_nops = p6_nops;
+			return;
+		}
+
+		/* fall through */
+
 	default:
 #ifdef CONFIG_X86_64
 		ideal_nops = k8_nops;
@@ -627,6 +640,7 @@ void __init alternative_instructions(void)
 	apply_paravirt(__parainstructions, __parainstructions_end);
 
 	restart_nmi();
+	alternatives_patched = 1;
 }
 
 /**
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 5caed1dd7ccf..29fa475ec518 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -89,9 +89,7 @@ int amd_cache_northbridges(void)
 			next_northbridge(link, amd_nb_link_ids);
 	}
 
-	/* GART present only on Fam15h upto model 0fh */
-	if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
-	    (boot_cpu_data.x86 == 0x15 && boot_cpu_data.x86_model < 0x10))
+	if (amd_gart_present())
 		amd_northbridges.flags |= AMD_NB_GART;
 
 	/*
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 6a7c23ff21d3..ede92c3364d3 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -171,10 +171,6 @@ static int __init apbt_clockevent_register(void)
 
 static void apbt_setup_irq(struct apbt_dev *adev)
 {
-	/* timer0 irq has been setup early */
-	if (adev->irq == 0)
-		return;
-
 	irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
 	irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
 }
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 76164e173a24..6e85f713641d 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -262,6 +262,9 @@ void __init early_gart_iommu_check(void)
 	u64 aper_base = 0, last_aper_base = 0;
 	int aper_enabled = 0, last_aper_enabled = 0, last_valid = 0;
 
+	if (!amd_gart_present())
+		return;
+
 	if (!early_pci_allowed())
 		return;
 
@@ -355,6 +358,9 @@ int __init gart_iommu_hole_init(void)
 	int fix, slot, valid_agp = 0;
 	int i, node;
 
+	if (!amd_gart_present())
+		return -ENODEV;
+
 	if (gart_iommu_aperture_disabled || !fix_aperture ||
 	    !early_pci_allowed())
 		return -ENODEV;
@@ -452,7 +458,7 @@ out:
 		   force_iommu ||
 		   valid_agp ||
 		   fallback_aper_force) {
-		pr_info("Your BIOS doesn't leave a aperture memory hole\n");
+		pr_info("Your BIOS doesn't leave an aperture memory hole\n");
 		pr_info("Please enable the IOMMU option in the BIOS setup\n");
 		pr_info("This costs you %dMB of RAM\n",
 			32 << fallback_aper_order);
diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c
index 816f36e979ad..ae50d3454d78 100644
--- a/arch/x86/kernel/apic/htirq.c
+++ b/arch/x86/kernel/apic/htirq.c
@@ -3,6 +3,8 @@
  *
  * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
  *	Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ *	Add support of hierarchical irqdomain
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -14,78 +16,112 @@
 #include <linux/device.h>
 #include <linux/pci.h>
 #include <linux/htirq.h>
+#include <asm/irqdomain.h>
 #include <asm/hw_irq.h>
 #include <asm/apic.h>
 #include <asm/hypertransport.h>
 
+static struct irq_domain *htirq_domain;
+
 /*
  * Hypertransport interrupt support
  */
-static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
-{
-	struct ht_irq_msg msg;
-
-	fetch_ht_irq_msg(irq, &msg);
-
-	msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
-	msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
-
-	msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
-	msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
-
-	write_ht_irq_msg(irq, &msg);
-}
-
 static int
 ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	unsigned int dest;
+	struct irq_data *parent = data->parent_data;
 	int ret;
 
-	ret = apic_set_affinity(data, mask, &dest);
-	if (ret)
-		return ret;
-
-	target_ht_irq(data->irq, dest, cfg->vector);
-	return IRQ_SET_MASK_OK_NOCOPY;
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	if (ret >= 0) {
+		struct ht_irq_msg msg;
+		struct irq_cfg *cfg = irqd_cfg(data);
+
+		fetch_ht_irq_msg(data->irq, &msg);
+		msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK |
+				    HT_IRQ_LOW_DEST_ID_MASK);
+		msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) |
+				  HT_IRQ_LOW_DEST_ID(cfg->dest_apicid);
+		msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+		msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid);
+		write_ht_irq_msg(data->irq, &msg);
+	}
+
+	return ret;
 }
 
 static struct irq_chip ht_irq_chip = {
 	.name			= "PCI-HT",
 	.irq_mask		= mask_ht_irq,
 	.irq_unmask		= unmask_ht_irq,
-	.irq_ack		= apic_ack_edge,
+	.irq_ack		= irq_chip_ack_parent,
 	.irq_set_affinity	= ht_set_affinity,
-	.irq_retrigger		= apic_retrigger_irq,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };
 
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs, void *arg)
 {
-	struct irq_cfg *cfg;
-	struct ht_irq_msg msg;
-	unsigned dest;
-	int err;
+	struct ht_irq_cfg *ht_cfg;
+	struct irq_alloc_info *info = arg;
+	struct pci_dev *dev;
+	irq_hw_number_t hwirq;
+	int ret;
 
-	if (disable_apic)
-		return -ENXIO;
+	if (nr_irqs > 1 || !info)
+		return -EINVAL;
 
-	cfg = irq_cfg(irq);
-	err = assign_irq_vector(irq, cfg, apic->target_cpus());
-	if (err)
-		return err;
+	dev = info->ht_dev;
+	hwirq = (info->ht_idx & 0xFF) |
+		PCI_DEVID(dev->bus->number, dev->devfn) << 8 |
+		(pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24;
+	if (irq_find_mapping(domain, hwirq) > 0)
+		return -EEXIST;
 
-	err = apic->cpu_mask_to_apicid_and(cfg->domain,
-					   apic->target_cpus(), &dest);
-	if (err)
-		return err;
+	ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL);
+	if (!ht_cfg)
+		return -ENOMEM;
 
-	msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+	if (ret < 0) {
+		kfree(ht_cfg);
+		return ret;
+	}
+
+	/* Initialize msg to a value that will never match the first write. */
+	ht_cfg->msg.address_lo = 0xffffffff;
+	ht_cfg->msg.address_hi = 0xffffffff;
+	ht_cfg->dev = info->ht_dev;
+	ht_cfg->update = info->ht_update;
+	ht_cfg->pos = info->ht_pos;
+	ht_cfg->idx = 0x10 + (info->ht_idx * 2);
+	irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg,
+			    handle_edge_irq, ht_cfg, "edge");
+
+	return 0;
+}
+
+static void htirq_domain_free(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs)
+{
+	struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+
+	BUG_ON(nr_irqs != 1);
+	kfree(irq_data->chip_data);
+	irq_domain_free_irqs_top(domain, virq, nr_irqs);
+}
 
+static void htirq_domain_activate(struct irq_domain *domain,
+				  struct irq_data *irq_data)
+{
+	struct ht_irq_msg msg;
+	struct irq_cfg *cfg = irqd_cfg(irq_data);
+
+	msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid);
 	msg.address_lo =
 		HT_IRQ_LOW_BASE |
-		HT_IRQ_LOW_DEST_ID(dest) |
+		HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) |
 		HT_IRQ_LOW_VECTOR(cfg->vector) |
 		((apic->irq_dest_mode == 0) ?
 			HT_IRQ_LOW_DM_PHYSICAL :
@@ -95,13 +131,56 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 			HT_IRQ_LOW_MT_FIXED :
 			HT_IRQ_LOW_MT_ARBITRATED) |
 		HT_IRQ_LOW_IRQ_MASKED;
+	write_ht_irq_msg(irq_data->irq, &msg);
+}
 
-	write_ht_irq_msg(irq, &msg);
+static void htirq_domain_deactivate(struct irq_domain *domain,
+				    struct irq_data *irq_data)
+{
+	struct ht_irq_msg msg;
 
-	irq_set_chip_and_handler_name(irq, &ht_irq_chip,
-				      handle_edge_irq, "edge");
+	memset(&msg, 0, sizeof(msg));
+	write_ht_irq_msg(irq_data->irq, &msg);
+}
 
-	dev_dbg(&dev->dev, "irq %d for HT\n", irq);
+static const struct irq_domain_ops htirq_domain_ops = {
+	.alloc		= htirq_domain_alloc,
+	.free		= htirq_domain_free,
+	.activate	= htirq_domain_activate,
+	.deactivate	= htirq_domain_deactivate,
+};
 
-	return 0;
+void arch_init_htirq_domain(struct irq_domain *parent)
+{
+	if (disable_apic)
+		return;
+
+	htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL);
+	if (!htirq_domain)
+		pr_warn("failed to initialize irqdomain for HTIRQ.\n");
+	else
+		htirq_domain->parent = parent;
+}
+
+int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev,
+		      ht_irq_update_t *update)
+{
+	struct irq_alloc_info info;
+
+	if (!htirq_domain)
+		return -ENOSYS;
+
+	init_irq_alloc_info(&info, NULL);
+	info.ht_idx = idx;
+	info.ht_pos = pos;
+	info.ht_dev = dev;
+	info.ht_update = update;
+
+	return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev),
+				     &info);
+}
+
+void arch_teardown_ht_irq(unsigned int irq)
+{
+	irq_domain_free_irqs(irq, 1);
 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index f4dc2462a1ac..845dc0df2002 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -18,6 +18,16 @@
  *					and Rolf G. Tews
  *					for testing these extensively
  *	Paul Diefenbaugh	:	Added full ACPI support
+ *
+ * Historical information which is worth to be preserved:
+ *
+ * - SiS APIC rmw bug:
+ *
+ *	We used to have a workaround for a bug in SiS chips which
+ *	required to rewrite the index register for a read-modify-write
+ *	operation as the chip lost the index information which was
+ *	setup for the read already. We cache the data now, so that
+ *	workaround has been removed.
  */
 
 #include <linux/mm.h>
@@ -31,13 +41,13 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/syscore_ops.h>
-#include <linux/irqdomain.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/jiffies.h>	/* time_after() */
 #include <linux/slab.h>
 #include <linux/bootmem.h>
 
+#include <asm/irqdomain.h>
 #include <asm/idle.h>
 #include <asm/io.h>
 #include <asm/smp.h>
@@ -63,27 +73,31 @@
 #define	for_each_ioapic_pin(idx, pin)	\
 	for_each_ioapic((idx))		\
 		for_each_pin((idx), (pin))
-
 #define for_each_irq_pin(entry, head) \
 	list_for_each_entry(entry, &head, list)
 
-/*
- *      Is the SiS APIC rmw bug present ?
- *      -1 = don't know, 0 = no, 1 = yes
- */
-int sis_apic_bug = -1;
-
 static DEFINE_RAW_SPINLOCK(ioapic_lock);
 static DEFINE_MUTEX(ioapic_mutex);
 static unsigned int ioapic_dynirq_base;
 static int ioapic_initialized;
 
-struct mp_pin_info {
+struct irq_pin_list {
+	struct list_head list;
+	int apic, pin;
+};
+
+struct mp_chip_data {
+	struct list_head irq_2_pin;
+	struct IO_APIC_route_entry entry;
 	int trigger;
 	int polarity;
-	int node;
-	int set;
 	u32 count;
+	bool isa_irq;
+};
+
+struct mp_ioapic_gsi {
+	u32 gsi_base;
+	u32 gsi_end;
 };
 
 static struct ioapic {
@@ -101,7 +115,6 @@ static struct ioapic {
 	struct mp_ioapic_gsi  gsi_config;
 	struct ioapic_domain_cfg irqdomain_cfg;
 	struct irq_domain *irqdomain;
-	struct mp_pin_info *pin_info;
 	struct resource *iomem_res;
 } ioapics[MAX_IO_APICS];
 
@@ -117,7 +130,7 @@ unsigned int mpc_ioapic_addr(int ioapic_idx)
 	return ioapics[ioapic_idx].mp_config.apicaddr;
 }
 
-struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
+static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
 {
 	return &ioapics[ioapic_idx].gsi_config;
 }
@@ -129,11 +142,16 @@ static inline int mp_ioapic_pin_count(int ioapic)
 	return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
 }
 
-u32 mp_pin_to_gsi(int ioapic, int pin)
+static inline u32 mp_pin_to_gsi(int ioapic, int pin)
 {
 	return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin;
 }
 
+static inline bool mp_is_legacy_irq(int irq)
+{
+	return irq >= 0 && irq < nr_legacy_irqs();
+}
+
 /*
  * Initialize all legacy IRQs and all pins on the first IOAPIC
  * if we have legacy interrupt controller. Kernel boot option "pirq="
@@ -144,12 +162,7 @@ static inline int mp_init_irq_at_boot(int ioapic, int irq)
 	if (!nr_legacy_irqs())
 		return 0;
 
-	return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs());
-}
-
-static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin)
-{
-	return ioapics[ioapic_idx].pin_info + pin;
+	return ioapic == 0 || mp_is_legacy_irq(irq);
 }
 
 static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
@@ -216,16 +229,6 @@ void mp_save_irq(struct mpc_intsrc *m)
 		panic("Max # of irq sources exceeded!!\n");
 }
 
-struct irq_pin_list {
-	struct list_head list;
-	int apic, pin;
-};
-
-static struct irq_pin_list *alloc_irq_pin_list(int node)
-{
-	return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
-}
-
 static void alloc_ioapic_saved_registers(int idx)
 {
 	size_t size;
@@ -247,8 +250,7 @@ static void free_ioapic_saved_registers(int idx)
 
 int __init arch_early_ioapic_init(void)
 {
-	struct irq_cfg *cfg;
-	int i, node = cpu_to_node(0);
+	int i;
 
 	if (!nr_legacy_irqs())
 		io_apic_irqs = ~0UL;
@@ -256,16 +258,6 @@ int __init arch_early_ioapic_init(void)
 	for_each_ioapic(i)
 		alloc_ioapic_saved_registers(i);
 
-	/*
-	 * For legacy IRQ's, start with assigning irq0 to irq15 to
-	 * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's.
-	 */
-	for (i = 0; i < nr_legacy_irqs(); i++) {
-		cfg = alloc_irq_and_cfg_at(i, node);
-		cfg->vector = IRQ0_VECTOR + i;
-		cpumask_setall(cfg->domain);
-	}
-
 	return 0;
 }
 
@@ -283,7 +275,7 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
 		+ (mpc_ioapic_addr(idx) & ~PAGE_MASK);
 }
 
-void io_apic_eoi(unsigned int apic, unsigned int vector)
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 	writel(vector, &io_apic->eoi);
@@ -296,7 +288,8 @@ unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
 	return readl(&io_apic->data);
 }
 
-void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+static void io_apic_write(unsigned int apic, unsigned int reg,
+			  unsigned int value)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 
@@ -304,21 +297,6 @@ void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int valu
 	writel(value, &io_apic->data);
 }
 
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- *
- * Older SiS APIC requires we rewrite the index register
- */
-void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	struct io_apic __iomem *io_apic = io_apic_base(apic);
-
-	if (sis_apic_bug)
-		writel(reg, &io_apic->index);
-	writel(value, &io_apic->data);
-}
-
 union entry_union {
 	struct { u32 w1, w2; };
 	struct IO_APIC_route_entry entry;
@@ -378,7 +356,7 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
 static void ioapic_mask_entry(int apic, int pin)
 {
 	unsigned long flags;
-	union entry_union eu = { .entry.mask = 1 };
+	union entry_union eu = { .entry.mask = IOAPIC_MASKED };
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
@@ -391,16 +369,17 @@ static void ioapic_mask_entry(int apic, int pin)
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static int __add_pin_to_irq_node(struct mp_chip_data *data,
+				 int node, int apic, int pin)
 {
 	struct irq_pin_list *entry;
 
 	/* don't allow duplicates */
-	for_each_irq_pin(entry, cfg->irq_2_pin)
+	for_each_irq_pin(entry, data->irq_2_pin)
 		if (entry->apic == apic && entry->pin == pin)
 			return 0;
 
-	entry = alloc_irq_pin_list(node);
+	entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node);
 	if (!entry) {
 		pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
 		       node, apic, pin);
@@ -408,16 +387,16 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi
 	}
 	entry->apic = apic;
 	entry->pin = pin;
+	list_add_tail(&entry->list, &data->irq_2_pin);
 
-	list_add_tail(&entry->list, &cfg->irq_2_pin);
 	return 0;
 }
 
-static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin)
+static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin)
 {
 	struct irq_pin_list *tmp, *entry;
 
-	list_for_each_entry_safe(entry, tmp, &cfg->irq_2_pin, list)
+	list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list)
 		if (entry->apic == apic && entry->pin == pin) {
 			list_del(&entry->list);
 			kfree(entry);
@@ -425,22 +404,23 @@ static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin)
 		}
 }
 
-static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static void add_pin_to_irq_node(struct mp_chip_data *data,
+				int node, int apic, int pin)
 {
-	if (__add_pin_to_irq_node(cfg, node, apic, pin))
+	if (__add_pin_to_irq_node(data, node, apic, pin))
 		panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
 }
 
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
+static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
 					   int oldapic, int oldpin,
 					   int newapic, int newpin)
 {
 	struct irq_pin_list *entry;
 
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
+	for_each_irq_pin(entry, data->irq_2_pin) {
 		if (entry->apic == oldapic && entry->pin == oldpin) {
 			entry->apic = newapic;
 			entry->pin = newpin;
@@ -450,32 +430,26 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
 	}
 
 	/* old apic/pin didn't exist, so just add new ones */
-	add_pin_to_irq_node(cfg, node, newapic, newpin);
-}
-
-static void __io_apic_modify_irq(struct irq_pin_list *entry,
-				 int mask_and, int mask_or,
-				 void (*final)(struct irq_pin_list *entry))
-{
-	unsigned int reg, pin;
-
-	pin = entry->pin;
-	reg = io_apic_read(entry->apic, 0x10 + pin * 2);
-	reg &= mask_and;
-	reg |= mask_or;
-	io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
-	if (final)
-		final(entry);
+	add_pin_to_irq_node(data, node, newapic, newpin);
 }
 
-static void io_apic_modify_irq(struct irq_cfg *cfg,
+static void io_apic_modify_irq(struct mp_chip_data *data,
 			       int mask_and, int mask_or,
 			       void (*final)(struct irq_pin_list *entry))
 {
+	union entry_union eu;
 	struct irq_pin_list *entry;
 
-	for_each_irq_pin(entry, cfg->irq_2_pin)
-		__io_apic_modify_irq(entry, mask_and, mask_or, final);
+	eu.entry = data->entry;
+	eu.w1 &= mask_and;
+	eu.w1 |= mask_or;
+	data->entry = eu.entry;
+
+	for_each_irq_pin(entry, data->irq_2_pin) {
+		io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1);
+		if (final)
+			final(entry);
+	}
 }
 
 static void io_apic_sync(struct irq_pin_list *entry)
@@ -490,39 +464,31 @@ static void io_apic_sync(struct irq_pin_list *entry)
 	readl(&io_apic->data);
 }
 
-static void mask_ioapic(struct irq_cfg *cfg)
+static void mask_ioapic_irq(struct irq_data *irq_data)
 {
+	struct mp_chip_data *data = irq_data->chip_data;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+	io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void mask_ioapic_irq(struct irq_data *data)
+static void __unmask_ioapic(struct mp_chip_data *data)
 {
-	mask_ioapic(irqd_cfg(data));
+	io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL);
 }
 
-static void __unmask_ioapic(struct irq_cfg *cfg)
-{
-	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
-}
-
-static void unmask_ioapic(struct irq_cfg *cfg)
+static void unmask_ioapic_irq(struct irq_data *irq_data)
 {
+	struct mp_chip_data *data = irq_data->chip_data;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	__unmask_ioapic(cfg);
+	__unmask_ioapic(data);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void unmask_ioapic_irq(struct irq_data *data)
-{
-	unmask_ioapic(irqd_cfg(data));
-}
-
 /*
  * IO-APIC versions below 0x20 don't support EOI register.
  * For the record, here is the information about various versions:
@@ -539,7 +505,7 @@ static void unmask_ioapic_irq(struct irq_data *data)
  * Otherwise, we simulate the EOI message manually by changing the trigger
  * mode to edge and then back to level, with RTE being masked during this.
  */
-void native_eoi_ioapic_pin(int apic, int pin, int vector)
+static void __eoi_ioapic_pin(int apic, int pin, int vector)
 {
 	if (mpc_ioapic_ver(apic) >= 0x20) {
 		io_apic_eoi(apic, vector);
@@ -551,7 +517,7 @@ void native_eoi_ioapic_pin(int apic, int pin, int vector)
 		/*
 		 * Mask the entry and change the trigger mode to edge.
 		 */
-		entry1.mask = 1;
+		entry1.mask = IOAPIC_MASKED;
 		entry1.trigger = IOAPIC_EDGE;
 
 		__ioapic_write_entry(apic, pin, entry1);
@@ -563,15 +529,14 @@ void native_eoi_ioapic_pin(int apic, int pin, int vector)
 	}
 }
 
-void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+void eoi_ioapic_pin(int vector, struct mp_chip_data *data)
 {
-	struct irq_pin_list *entry;
 	unsigned long flags;
+	struct irq_pin_list *entry;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	for_each_irq_pin(entry, cfg->irq_2_pin)
-		x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin,
-					       cfg->vector);
+	for_each_irq_pin(entry, data->irq_2_pin)
+		__eoi_ioapic_pin(entry->apic, entry->pin, vector);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -588,8 +553,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 	 * Make sure the entry is masked and re-read the contents to check
 	 * if it is a level triggered pin and if the remote-IRR is set.
 	 */
-	if (!entry.mask) {
-		entry.mask = 1;
+	if (entry.mask == IOAPIC_UNMASKED) {
+		entry.mask = IOAPIC_MASKED;
 		ioapic_write_entry(apic, pin, entry);
 		entry = ioapic_read_entry(apic, pin);
 	}
@@ -602,13 +567,12 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 		 * doesn't clear the remote-IRR if the trigger mode is not
 		 * set to level.
 		 */
-		if (!entry.trigger) {
+		if (entry.trigger == IOAPIC_EDGE) {
 			entry.trigger = IOAPIC_LEVEL;
 			ioapic_write_entry(apic, pin, entry);
 		}
-
 		raw_spin_lock_irqsave(&ioapic_lock, flags);
-		x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector);
+		__eoi_ioapic_pin(apic, pin, entry.vector);
 		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 
@@ -706,8 +670,8 @@ void mask_ioapic_entries(void)
 			struct IO_APIC_route_entry entry;
 
 			entry = ioapics[apic].saved_registers[pin];
-			if (!entry.mask) {
-				entry.mask = 1;
+			if (entry.mask == IOAPIC_UNMASKED) {
+				entry.mask = IOAPIC_MASKED;
 				ioapic_write_entry(apic, pin, entry);
 			}
 		}
@@ -809,11 +773,11 @@ static int EISA_ELCR(unsigned int irq)
 
 #endif
 
-/* ISA interrupts are always polarity zero edge triggered,
+/* ISA interrupts are always active high edge triggered,
  * when listed as conforming in the MP table. */
 
-#define default_ISA_trigger(idx)	(0)
-#define default_ISA_polarity(idx)	(0)
+#define default_ISA_trigger(idx)	(IOAPIC_EDGE)
+#define default_ISA_polarity(idx)	(IOAPIC_POL_HIGH)
 
 /* EISA interrupts are always polarity zero and can be edge or level
  * trigger depending on the ELCR value.  If an interrupt is listed as
@@ -823,53 +787,55 @@ static int EISA_ELCR(unsigned int irq)
 #define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].srcbusirq))
 #define default_EISA_polarity(idx)	default_ISA_polarity(idx)
 
-/* PCI interrupts are always polarity one level triggered,
+/* PCI interrupts are always active low level triggered,
  * when listed as conforming in the MP table. */
 
-#define default_PCI_trigger(idx)	(1)
-#define default_PCI_polarity(idx)	(1)
+#define default_PCI_trigger(idx)	(IOAPIC_LEVEL)
+#define default_PCI_polarity(idx)	(IOAPIC_POL_LOW)
 
 static int irq_polarity(int idx)
 {
 	int bus = mp_irqs[idx].srcbus;
-	int polarity;
 
 	/*
 	 * Determine IRQ line polarity (high active or low active):
 	 */
-	switch (mp_irqs[idx].irqflag & 3)
-	{
-		case 0: /* conforms, ie. bus-type dependent polarity */
-			if (test_bit(bus, mp_bus_not_pci))
-				polarity = default_ISA_polarity(idx);
-			else
-				polarity = default_PCI_polarity(idx);
-			break;
-		case 1: /* high active */
-		{
-			polarity = 0;
-			break;
-		}
-		case 2: /* reserved */
-		{
-			pr_warn("broken BIOS!!\n");
-			polarity = 1;
-			break;
-		}
-		case 3: /* low active */
-		{
-			polarity = 1;
-			break;
-		}
-		default: /* invalid */
-		{
-			pr_warn("broken BIOS!!\n");
-			polarity = 1;
-			break;
-		}
+	switch (mp_irqs[idx].irqflag & 0x03) {
+	case 0:
+		/* conforms to spec, ie. bus-type dependent polarity */
+		if (test_bit(bus, mp_bus_not_pci))
+			return default_ISA_polarity(idx);
+		else
+			return default_PCI_polarity(idx);
+	case 1:
+		return IOAPIC_POL_HIGH;
+	case 2:
+		pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
+	case 3:
+	default: /* Pointless default required due to do gcc stupidity */
+		return IOAPIC_POL_LOW;
+	}
+}
+
+#ifdef CONFIG_EISA
+static int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+	switch (mp_bus_id_to_type[bus]) {
+	case MP_BUS_PCI:
+	case MP_BUS_ISA:
+		return trigger;
+	case MP_BUS_EISA:
+		return default_EISA_trigger(idx);
 	}
-	return polarity;
+	pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus);
+	return IOAPIC_LEVEL;
 }
+#else
+static inline int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+	return trigger;
+}
+#endif
 
 static int irq_trigger(int idx)
 {
@@ -879,153 +845,227 @@ static int irq_trigger(int idx)
 	/*
 	 * Determine IRQ trigger mode (edge or level sensitive):
 	 */
-	switch ((mp_irqs[idx].irqflag>>2) & 3)
-	{
-		case 0: /* conforms, ie. bus-type dependent */
-			if (test_bit(bus, mp_bus_not_pci))
-				trigger = default_ISA_trigger(idx);
-			else
-				trigger = default_PCI_trigger(idx);
-#ifdef CONFIG_EISA
-			switch (mp_bus_id_to_type[bus]) {
-				case MP_BUS_ISA: /* ISA pin */
-				{
-					/* set before the switch */
-					break;
-				}
-				case MP_BUS_EISA: /* EISA pin */
-				{
-					trigger = default_EISA_trigger(idx);
-					break;
-				}
-				case MP_BUS_PCI: /* PCI pin */
-				{
-					/* set before the switch */
-					break;
-				}
-				default:
-				{
-					pr_warn("broken BIOS!!\n");
-					trigger = 1;
-					break;
-				}
-			}
+	switch ((mp_irqs[idx].irqflag >> 2) & 0x03) {
+	case 0:
+		/* conforms to spec, ie. bus-type dependent trigger mode */
+		if (test_bit(bus, mp_bus_not_pci))
+			trigger = default_ISA_trigger(idx);
+		else
+			trigger = default_PCI_trigger(idx);
+		/* Take EISA into account */
+		return eisa_irq_trigger(idx, bus, trigger);
+	case 1:
+		return IOAPIC_EDGE;
+	case 2:
+		pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
+	case 3:
+	default: /* Pointless default required due to do gcc stupidity */
+		return IOAPIC_LEVEL;
+	}
+}
+
+void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node,
+			   int trigger, int polarity)
+{
+	init_irq_alloc_info(info, NULL);
+	info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+	info->ioapic_node = node;
+	info->ioapic_trigger = trigger;
+	info->ioapic_polarity = polarity;
+	info->ioapic_valid = 1;
+}
+
+#ifndef CONFIG_ACPI
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
 #endif
-			break;
-		case 1: /* edge */
-		{
-			trigger = 0;
-			break;
-		}
-		case 2: /* reserved */
-		{
-			pr_warn("broken BIOS!!\n");
-			trigger = 1;
-			break;
-		}
-		case 3: /* level */
-		{
-			trigger = 1;
-			break;
-		}
-		default: /* invalid */
-		{
-			pr_warn("broken BIOS!!\n");
-			trigger = 0;
-			break;
+
+static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
+				   struct irq_alloc_info *src,
+				   u32 gsi, int ioapic_idx, int pin)
+{
+	int trigger, polarity;
+
+	copy_irq_alloc_info(dst, src);
+	dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+	dst->ioapic_id = mpc_ioapic_id(ioapic_idx);
+	dst->ioapic_pin = pin;
+	dst->ioapic_valid = 1;
+	if (src && src->ioapic_valid) {
+		dst->ioapic_node = src->ioapic_node;
+		dst->ioapic_trigger = src->ioapic_trigger;
+		dst->ioapic_polarity = src->ioapic_polarity;
+	} else {
+		dst->ioapic_node = NUMA_NO_NODE;
+		if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) {
+			dst->ioapic_trigger = trigger;
+			dst->ioapic_polarity = polarity;
+		} else {
+			/*
+			 * PCI interrupts are always active low level
+			 * triggered.
+			 */
+			dst->ioapic_trigger = IOAPIC_LEVEL;
+			dst->ioapic_polarity = IOAPIC_POL_LOW;
 		}
 	}
-	return trigger;
 }
 
-static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin)
+static int ioapic_alloc_attr_node(struct irq_alloc_info *info)
+{
+	return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE;
+}
+
+static void mp_register_handler(unsigned int irq, unsigned long trigger)
+{
+	irq_flow_handler_t hdl;
+	bool fasteoi;
+
+	if (trigger) {
+		irq_set_status_flags(irq, IRQ_LEVEL);
+		fasteoi = true;
+	} else {
+		irq_clear_status_flags(irq, IRQ_LEVEL);
+		fasteoi = false;
+	}
+
+	hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
+	__irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge");
+}
+
+static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
 {
+	struct mp_chip_data *data = irq_get_chip_data(irq);
+
+	/*
+	 * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger
+	 * and polarity attirbutes. So allow the first user to reprogram the
+	 * pin with real trigger and polarity attributes.
+	 */
+	if (irq < nr_legacy_irqs() && data->count == 1) {
+		if (info->ioapic_trigger != data->trigger)
+			mp_register_handler(irq, data->trigger);
+		data->entry.trigger = data->trigger = info->ioapic_trigger;
+		data->entry.polarity = data->polarity = info->ioapic_polarity;
+	}
+
+	return data->trigger == info->ioapic_trigger &&
+	       data->polarity == info->ioapic_polarity;
+}
+
+static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
+				 struct irq_alloc_info *info)
+{
+	bool legacy = false;
 	int irq = -1;
-	int ioapic = (int)(long)domain->host_data;
 	int type = ioapics[ioapic].irqdomain_cfg.type;
 
 	switch (type) {
 	case IOAPIC_DOMAIN_LEGACY:
 		/*
-		 * Dynamically allocate IRQ number for non-ISA IRQs in the first 16
-		 * GSIs on some weird platforms.
+		 * Dynamically allocate IRQ number for non-ISA IRQs in the first
+		 * 16 GSIs on some weird platforms.
 		 */
-		if (gsi < nr_legacy_irqs())
-			irq = irq_create_mapping(domain, pin);
-		else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
+		if (!ioapic_initialized || gsi >= nr_legacy_irqs())
 			irq = gsi;
+		legacy = mp_is_legacy_irq(irq);
 		break;
 	case IOAPIC_DOMAIN_STRICT:
-		if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
-			irq = gsi;
+		irq = gsi;
 		break;
 	case IOAPIC_DOMAIN_DYNAMIC:
-		irq = irq_create_mapping(domain, pin);
 		break;
 	default:
 		WARN(1, "ioapic: unknown irqdomain type %d\n", type);
-		break;
+		return -1;
+	}
+
+	return __irq_domain_alloc_irqs(domain, irq, 1,
+				       ioapic_alloc_attr_node(info),
+				       info, legacy);
+}
+
+/*
+ * Need special handling for ISA IRQs because there may be multiple IOAPIC pins
+ * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping
+ * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are
+ * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
+ * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and
+ * some BIOSes may use MP Interrupt Source records to override IRQ numbers for
+ * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be
+ * multiple pins sharing the same legacy IRQ number when ACPI is disabled.
+ */
+static int alloc_isa_irq_from_domain(struct irq_domain *domain,
+				     int irq, int ioapic, int pin,
+				     struct irq_alloc_info *info)
+{
+	struct mp_chip_data *data;
+	struct irq_data *irq_data = irq_get_irq_data(irq);
+	int node = ioapic_alloc_attr_node(info);
+
+	/*
+	 * Legacy ISA IRQ has already been allocated, just add pin to
+	 * the pin list assoicated with this IRQ and program the IOAPIC
+	 * entry. The IOAPIC entry
+	 */
+	if (irq_data && irq_data->parent_data) {
+		if (!mp_check_pin_attr(irq, info))
+			return -EBUSY;
+		if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic,
+					  info->ioapic_pin))
+			return -ENOMEM;
+	} else {
+		irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true);
+		if (irq >= 0) {
+			irq_data = irq_domain_get_irq_data(domain, irq);
+			data = irq_data->chip_data;
+			data->isa_irq = true;
+		}
 	}
 
-	return irq > 0 ? irq : -1;
+	return irq;
 }
 
 static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
-			     unsigned int flags)
+			     unsigned int flags, struct irq_alloc_info *info)
 {
 	int irq;
+	bool legacy = false;
+	struct irq_alloc_info tmp;
+	struct mp_chip_data *data;
 	struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
-	struct mp_pin_info *info = mp_pin_info(ioapic, pin);
 
 	if (!domain)
-		return -1;
+		return -ENOSYS;
 
-	mutex_lock(&ioapic_mutex);
-
-	/*
-	 * Don't use irqdomain to manage ISA IRQs because there may be
-	 * multiple IOAPIC pins sharing the same ISA IRQ number and
-	 * irqdomain only supports 1:1 mapping between IOAPIC pin and
-	 * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used
-	 * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
-	 * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are
-	 * available, and some BIOSes may use MP Interrupt Source records
-	 * to override IRQ numbers for PIRQs instead of reprogramming
-	 * the interrupt routing logic. Thus there may be multiple pins
-	 * sharing the same legacy IRQ number when ACPI is disabled.
-	 */
 	if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
 		irq = mp_irqs[idx].srcbusirq;
-		if (flags & IOAPIC_MAP_ALLOC) {
-			if (info->count == 0 &&
-			    mp_irqdomain_map(domain, irq, pin) != 0)
-				irq = -1;
+		legacy = mp_is_legacy_irq(irq);
+	}
 
-			/* special handling for timer IRQ0 */
+	mutex_lock(&ioapic_mutex);
+	if (!(flags & IOAPIC_MAP_ALLOC)) {
+		if (!legacy) {
+			irq = irq_find_mapping(domain, pin);
 			if (irq == 0)
-				info->count++;
+				irq = -ENOENT;
 		}
 	} else {
-		irq = irq_find_mapping(domain, pin);
-		if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC))
-			irq = alloc_irq_from_domain(domain, gsi, pin);
-	}
-
-	if (flags & IOAPIC_MAP_ALLOC) {
-		/* special handling for legacy IRQs */
-		if (irq < nr_legacy_irqs() && info->count == 1 &&
-		    mp_irqdomain_map(domain, irq, pin) != 0)
-			irq = -1;
-
-		if (irq > 0)
-			info->count++;
-		else if (info->count == 0)
-			info->set = 0;
+		ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin);
+		if (legacy)
+			irq = alloc_isa_irq_from_domain(domain, irq,
+							ioapic, pin, &tmp);
+		else if ((irq = irq_find_mapping(domain, pin)) == 0)
+			irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp);
+		else if (!mp_check_pin_attr(irq, &tmp))
+			irq = -EBUSY;
+		if (irq >= 0) {
+			data = irq_get_chip_data(irq);
+			data->count++;
+		}
 	}
-
 	mutex_unlock(&ioapic_mutex);
 
-	return irq > 0 ? irq : -1;
+	return irq;
 }
 
 static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
@@ -1058,10 +1098,10 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
 	}
 #endif
 
-	return  mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+	return  mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL);
 }
 
-int mp_map_gsi_to_irq(u32 gsi, unsigned int flags)
+int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info)
 {
 	int ioapic, pin, idx;
 
@@ -1074,31 +1114,24 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags)
 	if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
 		return -1;
 
-	return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+	return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info);
 }
 
 void mp_unmap_irq(int irq)
 {
-	struct irq_data *data = irq_get_irq_data(irq);
-	struct mp_pin_info *info;
-	int ioapic, pin;
+	struct irq_data *irq_data = irq_get_irq_data(irq);
+	struct mp_chip_data *data;
 
-	if (!data || !data->domain)
+	if (!irq_data || !irq_data->domain)
 		return;
 
-	ioapic = (int)(long)data->domain->host_data;
-	pin = (int)data->hwirq;
-	info = mp_pin_info(ioapic, pin);
+	data = irq_data->chip_data;
+	if (!data || data->isa_irq)
+		return;
 
 	mutex_lock(&ioapic_mutex);
-	if (--info->count == 0) {
-		info->set = 0;
-		if (irq < nr_legacy_irqs() &&
-		    ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY)
-			mp_irqdomain_unmap(data->domain, irq);
-		else
-			irq_dispose_mapping(irq);
-	}
+	if (--data->count == 0)
+		irq_domain_free_irqs(irq, 1);
 	mutex_unlock(&ioapic_mutex);
 }
 
@@ -1165,7 +1198,7 @@ out:
 }
 EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
 
-static struct irq_chip ioapic_chip;
+static struct irq_chip ioapic_chip, ioapic_ir_chip;
 
 #ifdef CONFIG_X86_32
 static inline int IO_APIC_irq_trigger(int irq)
@@ -1189,96 +1222,6 @@ static inline int IO_APIC_irq_trigger(int irq)
 }
 #endif
 
-static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
-				 unsigned long trigger)
-{
-	struct irq_chip *chip = &ioapic_chip;
-	irq_flow_handler_t hdl;
-	bool fasteoi;
-
-	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
-	    trigger == IOAPIC_LEVEL) {
-		irq_set_status_flags(irq, IRQ_LEVEL);
-		fasteoi = true;
-	} else {
-		irq_clear_status_flags(irq, IRQ_LEVEL);
-		fasteoi = false;
-	}
-
-	if (setup_remapped_irq(irq, cfg, chip))
-		fasteoi = trigger != 0;
-
-	hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
-	irq_set_chip_and_handler_name(irq, chip, hdl,
-				      fasteoi ? "fasteoi" : "edge");
-}
-
-int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
-			      unsigned int destination, int vector,
-			      struct io_apic_irq_attr *attr)
-{
-	memset(entry, 0, sizeof(*entry));
-
-	entry->delivery_mode = apic->irq_delivery_mode;
-	entry->dest_mode     = apic->irq_dest_mode;
-	entry->dest	     = destination;
-	entry->vector	     = vector;
-	entry->mask	     = 0;			/* enable IRQ */
-	entry->trigger	     = attr->trigger;
-	entry->polarity	     = attr->polarity;
-
-	/*
-	 * Mask level triggered irqs.
-	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
-	 */
-	if (attr->trigger)
-		entry->mask = 1;
-
-	return 0;
-}
-
-static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
-				struct io_apic_irq_attr *attr)
-{
-	struct IO_APIC_route_entry entry;
-	unsigned int dest;
-
-	if (!IO_APIC_IRQ(irq))
-		return;
-
-	if (assign_irq_vector(irq, cfg, apic->target_cpus()))
-		return;
-
-	if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(),
-					 &dest)) {
-		pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n",
-			mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
-		clear_irq_vector(irq, cfg);
-
-		return;
-	}
-
-	apic_printk(APIC_VERBOSE,KERN_DEBUG
-		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
-		    "IRQ %d Mode:%i Active:%i Dest:%d)\n",
-		    attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin,
-		    cfg->vector, irq, attr->trigger, attr->polarity, dest);
-
-	if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) {
-		pr_warn("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
-			mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
-		clear_irq_vector(irq, cfg);
-
-		return;
-	}
-
-	ioapic_register_intr(irq, cfg, attr->trigger);
-	if (irq < nr_legacy_irqs())
-		legacy_pic->mask(irq);
-
-	ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry);
-}
-
 static void __init setup_IO_APIC_irqs(void)
 {
 	unsigned int ioapic, pin;
@@ -1298,106 +1241,41 @@ static void __init setup_IO_APIC_irqs(void)
 	}
 }
 
-/*
- * Set up the timer pin, possibly with the 8259A-master behind.
- */
-static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
-					unsigned int pin, int vector)
-{
-	struct IO_APIC_route_entry entry;
-	unsigned int dest;
-
-	memset(&entry, 0, sizeof(entry));
-
-	/*
-	 * We use logical delivery to get the timer IRQ
-	 * to the first CPU.
-	 */
-	if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(),
-						  apic->target_cpus(), &dest)))
-		dest = BAD_APICID;
-
-	entry.dest_mode = apic->irq_dest_mode;
-	entry.mask = 0;			/* don't mask IRQ for edge */
-	entry.dest = dest;
-	entry.delivery_mode = apic->irq_delivery_mode;
-	entry.polarity = 0;
-	entry.trigger = 0;
-	entry.vector = vector;
-
-	/*
-	 * The timer IRQ doesn't have to know that behind the
-	 * scene we may have a 8259A-master in AEOI mode ...
-	 */
-	irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
-				      "edge");
-
-	/*
-	 * Add it to the IO-APIC irq-routing table:
-	 */
-	ioapic_write_entry(ioapic_idx, pin, entry);
-}
-
-void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
+void ioapic_zap_locks(void)
 {
-	int i;
-
-	pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n");
-
-	for (i = 0; i <= nr_entries; i++) {
-		struct IO_APIC_route_entry entry;
-
-		entry = ioapic_read_entry(apic, i);
-
-		pr_debug(" %02x %02X  ", i, entry.dest);
-		pr_cont("%1d    %1d    %1d   %1d   %1d    "
-			"%1d    %1d    %02X\n",
-			entry.mask,
-			entry.trigger,
-			entry.irr,
-			entry.polarity,
-			entry.delivery_status,
-			entry.dest_mode,
-			entry.delivery_mode,
-			entry.vector);
-	}
+	raw_spin_lock_init(&ioapic_lock);
 }
 
-void intel_ir_io_apic_print_entries(unsigned int apic,
-				    unsigned int nr_entries)
+static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
 {
 	int i;
+	char buf[256];
+	struct IO_APIC_route_entry entry;
+	struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry;
 
-	pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n");
-
+	printk(KERN_DEBUG "IOAPIC %d:\n", apic);
 	for (i = 0; i <= nr_entries; i++) {
-		struct IR_IO_APIC_route_entry *ir_entry;
-		struct IO_APIC_route_entry entry;
-
 		entry = ioapic_read_entry(apic, i);
-
-		ir_entry = (struct IR_IO_APIC_route_entry *)&entry;
-
-		pr_debug(" %02x %04X ", i, ir_entry->index);
-		pr_cont("%1d   %1d    %1d    %1d   %1d   "
-			"%1d    %1d     %X    %02X\n",
-			ir_entry->format,
-			ir_entry->mask,
-			ir_entry->trigger,
-			ir_entry->irr,
-			ir_entry->polarity,
-			ir_entry->delivery_status,
-			ir_entry->index2,
-			ir_entry->zero,
-			ir_entry->vector);
+		snprintf(buf, sizeof(buf),
+			 " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
+			 i,
+			 entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ",
+			 entry.trigger == IOAPIC_LEVEL ? "level" : "edge ",
+			 entry.polarity == IOAPIC_POL_LOW ? "low " : "high",
+			 entry.vector, entry.irr, entry.delivery_status);
+		if (ir_entry->format)
+			printk(KERN_DEBUG "%s, remapped, I(%04X),  Z(%X)\n",
+			       buf, (ir_entry->index << 15) | ir_entry->index,
+			       ir_entry->zero);
+		else
+			printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n",
+			       buf,
+			       entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ?
+			       "logical " : "physical",
+			       entry.dest, entry.delivery_mode);
 	}
 }
 
-void ioapic_zap_locks(void)
-{
-	raw_spin_lock_init(&ioapic_lock);
-}
-
 static void __init print_IO_APIC(int ioapic_idx)
 {
 	union IO_APIC_reg_00 reg_00;
@@ -1451,16 +1329,13 @@ static void __init print_IO_APIC(int ioapic_idx)
 	}
 
 	printk(KERN_DEBUG ".... IRQ redirection table:\n");
-
-	x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries);
+	io_apic_print_entries(ioapic_idx, reg_01.bits.entries);
 }
 
 void __init print_IO_APICs(void)
 {
 	int ioapic_idx;
-	struct irq_cfg *cfg;
 	unsigned int irq;
-	struct irq_chip *chip;
 
 	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
 	for_each_ioapic(ioapic_idx)
@@ -1480,18 +1355,20 @@ void __init print_IO_APICs(void)
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
 	for_each_active_irq(irq) {
 		struct irq_pin_list *entry;
+		struct irq_chip *chip;
+		struct mp_chip_data *data;
 
 		chip = irq_get_chip(irq);
-		if (chip != &ioapic_chip)
+		if (chip != &ioapic_chip && chip != &ioapic_ir_chip)
 			continue;
-
-		cfg = irq_cfg(irq);
-		if (!cfg)
+		data = irq_get_chip_data(irq);
+		if (!data)
 			continue;
-		if (list_empty(&cfg->irq_2_pin))
+		if (list_empty(&data->irq_2_pin))
 			continue;
+
 		printk(KERN_DEBUG "IRQ%d ", irq);
-		for_each_irq_pin(entry, cfg->irq_2_pin)
+		for_each_irq_pin(entry, data->irq_2_pin)
 			pr_cont("-> %d:%d", entry->apic, entry->pin);
 		pr_cont("\n");
 	}
@@ -1564,15 +1441,12 @@ void native_disable_io_apic(void)
 		struct IO_APIC_route_entry entry;
 
 		memset(&entry, 0, sizeof(entry));
-		entry.mask            = 0; /* Enabled */
-		entry.trigger         = 0; /* Edge */
-		entry.irr             = 0;
-		entry.polarity        = 0; /* High */
-		entry.delivery_status = 0;
-		entry.dest_mode       = 0; /* Physical */
-		entry.delivery_mode   = dest_ExtINT; /* ExtInt */
-		entry.vector          = 0;
-		entry.dest            = read_apic_id();
+		entry.mask		= IOAPIC_UNMASKED;
+		entry.trigger		= IOAPIC_EDGE;
+		entry.polarity		= IOAPIC_POL_HIGH;
+		entry.dest_mode		= IOAPIC_DEST_MODE_PHYSICAL;
+		entry.delivery_mode	= dest_ExtINT;
+		entry.dest		= read_apic_id();
 
 		/*
 		 * Add it to the IO-APIC irq-routing table:
@@ -1582,7 +1456,6 @@ void native_disable_io_apic(void)
 
 	if (cpu_has_apic || apic_from_smp_config())
 		disconnect_bsp_APIC(ioapic_i8259.pin != -1);
-
 }
 
 /*
@@ -1792,7 +1665,6 @@ static int __init timer_irq_works(void)
  * This is not complete - we should be able to fake
  * an edge even if it isn't on the 8259A...
  */
-
 static unsigned int startup_ioapic_irq(struct irq_data *data)
 {
 	int was_pending = 0, irq = data->irq;
@@ -1804,74 +1676,22 @@ static unsigned int startup_ioapic_irq(struct irq_data *data)
 		if (legacy_pic->irq_pending(irq))
 			was_pending = 1;
 	}
-	__unmask_ioapic(irqd_cfg(data));
+	__unmask_ioapic(data->chip_data);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return was_pending;
 }
 
-/*
- * Level and edge triggered IO-APIC interrupts need different handling,
- * so we use two separate IRQ descriptors. Edge triggered IRQs can be
- * handled with the level-triggered descriptor, but that one has slightly
- * more overhead. Level-triggered interrupts cannot be handled with the
- * edge-triggered handler, without risking IRQ storms and other ugly
- * races.
- */
-
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
-{
-	int apic, pin;
-	struct irq_pin_list *entry;
-	u8 vector = cfg->vector;
-
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
-		unsigned int reg;
-
-		apic = entry->apic;
-		pin = entry->pin;
-
-		io_apic_write(apic, 0x11 + pin*2, dest);
-		reg = io_apic_read(apic, 0x10 + pin*2);
-		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
-		reg |= vector;
-		io_apic_modify(apic, 0x10 + pin*2, reg);
-	}
-}
-
-int native_ioapic_set_affinity(struct irq_data *data,
-			       const struct cpumask *mask,
-			       bool force)
-{
-	unsigned int dest, irq = data->irq;
-	unsigned long flags;
-	int ret;
-
-	if (!config_enabled(CONFIG_SMP))
-		return -EPERM;
-
-	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	ret = apic_set_affinity(data, mask, &dest);
-	if (!ret) {
-		/* Only the high 8 bits are valid. */
-		dest = SET_APIC_LOGICAL_ID(dest);
-		__target_IO_APIC_irq(irq, dest, irqd_cfg(data));
-		ret = IRQ_SET_MASK_OK_NOCOPY;
-	}
-	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-	return ret;
-}
-
 atomic_t irq_mis_count;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+static bool io_apic_level_ack_pending(struct mp_chip_data *data)
 {
 	struct irq_pin_list *entry;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
+	for_each_irq_pin(entry, data->irq_2_pin) {
 		unsigned int reg;
 		int pin;
 
@@ -1888,18 +1708,17 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
 	return false;
 }
 
-static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+static inline bool ioapic_irqd_mask(struct irq_data *data)
 {
 	/* If we are moving the irq we need to mask it */
 	if (unlikely(irqd_is_setaffinity_pending(data))) {
-		mask_ioapic(cfg);
+		mask_ioapic_irq(data);
 		return true;
 	}
 	return false;
 }
 
-static inline void ioapic_irqd_unmask(struct irq_data *data,
-				      struct irq_cfg *cfg, bool masked)
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
 {
 	if (unlikely(masked)) {
 		/* Only migrate the irq if the ack has been received.
@@ -1928,31 +1747,30 @@ static inline void ioapic_irqd_unmask(struct irq_data *data,
 		 * accurate and is causing problems then it is a hardware bug
 		 * and you can go talk to the chipset vendor about it.
 		 */
-		if (!io_apic_level_ack_pending(cfg))
+		if (!io_apic_level_ack_pending(data->chip_data))
 			irq_move_masked_irq(data);
-		unmask_ioapic(cfg);
+		unmask_ioapic_irq(data);
 	}
 }
 #else
-static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+static inline bool ioapic_irqd_mask(struct irq_data *data)
 {
 	return false;
 }
-static inline void ioapic_irqd_unmask(struct irq_data *data,
-				      struct irq_cfg *cfg, bool masked)
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
 {
 }
 #endif
 
-static void ack_ioapic_level(struct irq_data *data)
+static void ioapic_ack_level(struct irq_data *irq_data)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	int i, irq = data->irq;
+	struct irq_cfg *cfg = irqd_cfg(irq_data);
 	unsigned long v;
 	bool masked;
+	int i;
 
 	irq_complete_move(cfg);
-	masked = ioapic_irqd_mask(data, cfg);
+	masked = ioapic_irqd_mask(irq_data);
 
 	/*
 	 * It appears there is an erratum which affects at least version 0x11
@@ -2004,11 +1822,49 @@ static void ack_ioapic_level(struct irq_data *data)
 	 */
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
+		eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
+	}
+
+	ioapic_irqd_unmask(irq_data, masked);
+}
+
+static void ioapic_ir_ack_level(struct irq_data *irq_data)
+{
+	struct mp_chip_data *data = irq_data->chip_data;
+
+	/*
+	 * Intr-remapping uses pin number as the virtual vector
+	 * in the RTE. Actual vector is programmed in
+	 * intr-remapping table entry. Hence for the io-apic
+	 * EOI we use the pin number.
+	 */
+	ack_APIC_irq();
+	eoi_ioapic_pin(data->entry.vector, data);
+}
 
-		eoi_ioapic_irq(irq, cfg);
+static int ioapic_set_affinity(struct irq_data *irq_data,
+			       const struct cpumask *mask, bool force)
+{
+	struct irq_data *parent = irq_data->parent_data;
+	struct mp_chip_data *data = irq_data->chip_data;
+	struct irq_pin_list *entry;
+	struct irq_cfg *cfg;
+	unsigned long flags;
+	int ret;
+
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
+		cfg = irqd_cfg(irq_data);
+		data->entry.dest = cfg->dest_apicid;
+		data->entry.vector = cfg->vector;
+		for_each_irq_pin(entry, data->irq_2_pin)
+			__ioapic_write_entry(entry->apic, entry->pin,
+					     data->entry);
 	}
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
-	ioapic_irqd_unmask(data, cfg, masked);
+	return ret;
 }
 
 static struct irq_chip ioapic_chip __read_mostly = {
@@ -2016,10 +1872,20 @@ static struct irq_chip ioapic_chip __read_mostly = {
 	.irq_startup		= startup_ioapic_irq,
 	.irq_mask		= mask_ioapic_irq,
 	.irq_unmask		= unmask_ioapic_irq,
-	.irq_ack		= apic_ack_edge,
-	.irq_eoi		= ack_ioapic_level,
-	.irq_set_affinity	= native_ioapic_set_affinity,
-	.irq_retrigger		= apic_retrigger_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_eoi		= ioapic_ack_level,
+	.irq_set_affinity	= ioapic_set_affinity,
+	.flags			= IRQCHIP_SKIP_SET_WAKE,
+};
+
+static struct irq_chip ioapic_ir_chip __read_mostly = {
+	.name			= "IR-IO-APIC",
+	.irq_startup		= startup_ioapic_irq,
+	.irq_mask		= mask_ioapic_irq,
+	.irq_unmask		= unmask_ioapic_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_eoi		= ioapic_ir_ack_level,
+	.irq_set_affinity	= ioapic_set_affinity,
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };
 
@@ -2113,12 +1979,12 @@ static inline void __init unlock_ExtINT_logic(void)
 
 	memset(&entry1, 0, sizeof(entry1));
 
-	entry1.dest_mode = 0;			/* physical delivery */
-	entry1.mask = 0;			/* unmask IRQ now */
+	entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
+	entry1.mask = IOAPIC_UNMASKED;
 	entry1.dest = hard_smp_processor_id();
 	entry1.delivery_mode = dest_ExtINT;
 	entry1.polarity = entry0.polarity;
-	entry1.trigger = 0;
+	entry1.trigger = IOAPIC_EDGE;
 	entry1.vector = 0;
 
 	ioapic_write_entry(apic, pin, entry1);
@@ -2152,6 +2018,25 @@ static int __init disable_timer_pin_setup(char *arg)
 }
 early_param("disable_timer_pin_1", disable_timer_pin_setup);
 
+static int mp_alloc_timer_irq(int ioapic, int pin)
+{
+	int irq = -1;
+	struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+
+	if (domain) {
+		struct irq_alloc_info info;
+
+		ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0);
+		info.ioapic_id = mpc_ioapic_id(ioapic);
+		info.ioapic_pin = pin;
+		mutex_lock(&ioapic_mutex);
+		irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info);
+		mutex_unlock(&ioapic_mutex);
+	}
+
+	return irq;
+}
+
 /*
  * This code may look a bit paranoid, but it's supposed to cooperate with
  * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
@@ -2162,7 +2047,9 @@ early_param("disable_timer_pin_1", disable_timer_pin_setup);
  */
 static inline void __init check_timer(void)
 {
-	struct irq_cfg *cfg = irq_cfg(0);
+	struct irq_data *irq_data = irq_get_irq_data(0);
+	struct mp_chip_data *data = irq_data->chip_data;
+	struct irq_cfg *cfg = irqd_cfg(irq_data);
 	int node = cpu_to_node(0);
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
@@ -2174,7 +2061,6 @@ static inline void __init check_timer(void)
 	 * get/set the timer IRQ vector:
 	 */
 	legacy_pic->mask(0);
-	assign_irq_vector(0, cfg, apic->target_cpus());
 
 	/*
 	 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2215,23 +2101,21 @@ static inline void __init check_timer(void)
 	}
 
 	if (pin1 != -1) {
-		/*
-		 * Ok, does IRQ0 through the IOAPIC work?
-		 */
+		/* Ok, does IRQ0 through the IOAPIC work? */
 		if (no_pin1) {
-			add_pin_to_irq_node(cfg, node, apic1, pin1);
-			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+			mp_alloc_timer_irq(apic1, pin1);
 		} else {
-			/* for edge trigger, setup_ioapic_irq already
-			 * leave it unmasked.
+			/*
+			 * for edge trigger, it's already unmasked,
 			 * so only need to unmask if it is level-trigger
 			 * do we really have level trigger timer?
 			 */
 			int idx;
 			idx = find_irq_entry(apic1, pin1, mp_INT);
 			if (idx != -1 && irq_trigger(idx))
-				unmask_ioapic(cfg);
+				unmask_ioapic_irq(irq_get_chip_data(0));
 		}
+		irq_domain_activate_irq(irq_data);
 		if (timer_irq_works()) {
 			if (disable_timer_pin_1 > 0)
 				clear_IO_APIC_pin(0, pin1);
@@ -2251,8 +2135,8 @@ static inline void __init check_timer(void)
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
-		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+		replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2);
+		irq_domain_activate_irq(irq_data);
 		legacy_pic->unmask(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2329,36 +2213,35 @@ out:
 
 static int mp_irqdomain_create(int ioapic)
 {
-	size_t size;
+	struct irq_alloc_info info;
+	struct irq_domain *parent;
 	int hwirqs = mp_ioapic_pin_count(ioapic);
 	struct ioapic *ip = &ioapics[ioapic];
 	struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
 	struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
 
-	size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic);
-	ip->pin_info = kzalloc(size, GFP_KERNEL);
-	if (!ip->pin_info)
-		return -ENOMEM;
-
 	if (cfg->type == IOAPIC_DOMAIN_INVALID)
 		return 0;
 
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+	info.ioapic_id = mpc_ioapic_id(ioapic);
+	parent = irq_remapping_get_ir_irq_domain(&info);
+	if (!parent)
+		parent = x86_vector_domain;
+
 	ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops,
 					      (void *)(long)ioapic);
-	if(!ip->irqdomain) {
-		kfree(ip->pin_info);
-		ip->pin_info = NULL;
+	if (!ip->irqdomain)
 		return -ENOMEM;
-	}
+
+	ip->irqdomain->parent = parent;
 
 	if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
 	    cfg->type == IOAPIC_DOMAIN_STRICT)
 		ioapic_dynirq_base = max(ioapic_dynirq_base,
 					 gsi_cfg->gsi_end + 1);
 
-	if (gsi_cfg->gsi_base == 0)
-		irq_set_default_host(ip->irqdomain);
-
 	return 0;
 }
 
@@ -2368,8 +2251,6 @@ static void ioapic_destroy_irqdomain(int idx)
 		irq_domain_remove(ioapics[idx].irqdomain);
 		ioapics[idx].irqdomain = NULL;
 	}
-	kfree(ioapics[idx].pin_info);
-	ioapics[idx].pin_info = NULL;
 }
 
 void __init setup_IO_APIC(void)
@@ -2399,20 +2280,6 @@ void __init setup_IO_APIC(void)
 	ioapic_initialized = 1;
 }
 
-/*
- *      Called after all the initialization is done. If we didn't find any
- *      APIC bugs then we can allow the modify fast path
- */
-
-static int __init io_apic_bug_finalize(void)
-{
-	if (sis_apic_bug == -1)
-		sis_apic_bug = 0;
-	return 0;
-}
-
-late_initcall(io_apic_bug_finalize);
-
 static void resume_ioapic_id(int ioapic_idx)
 {
 	unsigned long flags;
@@ -2451,20 +2318,6 @@ static int __init ioapic_init_ops(void)
 
 device_initcall(ioapic_init_ops);
 
-static int
-io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
-{
-	struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
-	int ret;
-
-	if (!cfg)
-		return -EINVAL;
-	ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
-	if (!ret)
-		setup_ioapic_irq(irq, cfg, attr);
-	return ret;
-}
-
 static int io_apic_get_redir_entries(int ioapic)
 {
 	union IO_APIC_reg_01	reg_01;
@@ -2692,7 +2545,7 @@ void __init setup_ioapic_dest(void)
 		else
 			mask = apic->target_cpus();
 
-		x86_io_apic_ops.set_affinity(idata, mask, false);
+		irq_set_affinity(irq, mask);
 	}
 
 }
@@ -2737,7 +2590,7 @@ static struct resource * __init ioapic_setup_resources(void)
 	return res;
 }
 
-void __init native_io_apic_init_mappings(void)
+void __init io_apic_init_mappings(void)
 {
 	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
 	struct resource *ioapic_res;
@@ -2962,7 +2815,6 @@ int mp_unregister_ioapic(u32 gsi_base)
 {
 	int ioapic, pin;
 	int found = 0;
-	struct mp_pin_info *pin_info;
 
 	for_each_ioapic(ioapic)
 		if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) {
@@ -2975,11 +2827,17 @@ int mp_unregister_ioapic(u32 gsi_base)
 	}
 
 	for_each_pin(ioapic, pin) {
-		pin_info = mp_pin_info(ioapic, pin);
-		if (pin_info->count) {
-			pr_warn("pin%d on IOAPIC%d is still in use.\n",
-				pin, ioapic);
-			return -EBUSY;
+		u32 gsi = mp_pin_to_gsi(ioapic, pin);
+		int irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+		struct mp_chip_data *data;
+
+		if (irq >= 0) {
+			data = irq_get_chip_data(irq);
+			if (data && data->count) {
+				pr_warn("pin%d on IOAPIC%d is still in use.\n",
+					pin, ioapic);
+				return -EBUSY;
+			}
 		}
 	}
 
@@ -3006,108 +2864,141 @@ int mp_ioapic_registered(u32 gsi_base)
 	return 0;
 }
 
-static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
-					int ioapic, int ioapic_pin,
-					int trigger, int polarity)
+static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
+				  struct irq_alloc_info *info)
 {
-	irq_attr->ioapic	= ioapic;
-	irq_attr->ioapic_pin	= ioapic_pin;
-	irq_attr->trigger	= trigger;
-	irq_attr->polarity	= polarity;
+	if (info && info->ioapic_valid) {
+		data->trigger = info->ioapic_trigger;
+		data->polarity = info->ioapic_polarity;
+	} else if (acpi_get_override_irq(gsi, &data->trigger,
+					 &data->polarity) < 0) {
+		/* PCI interrupts are always active low level triggered. */
+		data->trigger = IOAPIC_LEVEL;
+		data->polarity = IOAPIC_POL_LOW;
+	}
 }
 
-int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
-		     irq_hw_number_t hwirq)
+static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data,
+			   struct IO_APIC_route_entry *entry)
 {
-	int ioapic = (int)(long)domain->host_data;
-	struct mp_pin_info *info = mp_pin_info(ioapic, hwirq);
-	struct io_apic_irq_attr attr;
+	memset(entry, 0, sizeof(*entry));
+	entry->delivery_mode = apic->irq_delivery_mode;
+	entry->dest_mode     = apic->irq_dest_mode;
+	entry->dest	     = cfg->dest_apicid;
+	entry->vector	     = cfg->vector;
+	entry->trigger	     = data->trigger;
+	entry->polarity	     = data->polarity;
+	/*
+	 * Mask level triggered irqs. Edge triggered irqs are masked
+	 * by the irq core code in case they fire.
+	 */
+	if (data->trigger == IOAPIC_LEVEL)
+		entry->mask = IOAPIC_MASKED;
+	else
+		entry->mask = IOAPIC_UNMASKED;
+}
 
-	/* Get default attribute if not set by caller yet */
-	if (!info->set) {
-		u32 gsi = mp_pin_to_gsi(ioapic, hwirq);
+int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+		       unsigned int nr_irqs, void *arg)
+{
+	int ret, ioapic, pin;
+	struct irq_cfg *cfg;
+	struct irq_data *irq_data;
+	struct mp_chip_data *data;
+	struct irq_alloc_info *info = arg;
 
-		if (acpi_get_override_irq(gsi, &info->trigger,
-					  &info->polarity) < 0) {
-			/*
-			 * PCI interrupts are always polarity one level
-			 * triggered.
-			 */
-			info->trigger = 1;
-			info->polarity = 1;
-		}
-		info->node = NUMA_NO_NODE;
+	if (!info || nr_irqs > 1)
+		return -EINVAL;
+	irq_data = irq_domain_get_irq_data(domain, virq);
+	if (!irq_data)
+		return -EINVAL;
 
-		/*
-		 * setup_IO_APIC_irqs() programs all legacy IRQs with default
-		 * trigger and polarity attributes. Don't set the flag for that
-		 * case so the first legacy IRQ user could reprogram the pin
-		 * with real trigger and polarity attributes.
-		 */
-		if (virq >= nr_legacy_irqs() || info->count)
-			info->set = 1;
-	}
-	set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger,
-			     info->polarity);
+	ioapic = mp_irqdomain_ioapic_idx(domain);
+	pin = info->ioapic_pin;
+	if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0)
+		return -EEXIST;
 
-	return io_apic_setup_irq_pin(virq, info->node, &attr);
-}
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
 
-void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq)
-{
-	struct irq_data *data = irq_get_irq_data(virq);
-	struct irq_cfg *cfg = irq_cfg(virq);
-	int ioapic = (int)(long)domain->host_data;
-	int pin = (int)data->hwirq;
+	info->ioapic_entry = &data->entry;
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+	if (ret < 0) {
+		kfree(data);
+		return ret;
+	}
+
+	INIT_LIST_HEAD(&data->irq_2_pin);
+	irq_data->hwirq = info->ioapic_pin;
+	irq_data->chip = (domain->parent == x86_vector_domain) ?
+			  &ioapic_chip : &ioapic_ir_chip;
+	irq_data->chip_data = data;
+	mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
+
+	cfg = irqd_cfg(irq_data);
+	add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin);
+	if (info->ioapic_entry)
+		mp_setup_entry(cfg, data, info->ioapic_entry);
+	mp_register_handler(virq, data->trigger);
+	if (virq < nr_legacy_irqs())
+		legacy_pic->mask(virq);
+
+	apic_printk(APIC_VERBOSE, KERN_DEBUG
+		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n",
+		    ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector,
+		    virq, data->trigger, data->polarity, cfg->dest_apicid);
 
-	ioapic_mask_entry(ioapic, pin);
-	__remove_pin_from_irq(cfg, ioapic, pin);
-	WARN_ON(!list_empty(&cfg->irq_2_pin));
-	arch_teardown_hwirq(virq);
+	return 0;
 }
 
-int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node)
+void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+		       unsigned int nr_irqs)
 {
-	int ret = 0;
-	int ioapic, pin;
-	struct mp_pin_info *info;
+	struct irq_data *irq_data;
+	struct mp_chip_data *data;
 
-	ioapic = mp_find_ioapic(gsi);
-	if (ioapic < 0)
-		return -ENODEV;
-
-	pin = mp_find_ioapic_pin(ioapic, gsi);
-	info = mp_pin_info(ioapic, pin);
-	trigger = trigger ? 1 : 0;
-	polarity = polarity ? 1 : 0;
-
-	mutex_lock(&ioapic_mutex);
-	if (!info->set) {
-		info->trigger = trigger;
-		info->polarity = polarity;
-		info->node = node;
-		info->set = 1;
-	} else if (info->trigger != trigger || info->polarity != polarity) {
-		ret = -EBUSY;
+	BUG_ON(nr_irqs != 1);
+	irq_data = irq_domain_get_irq_data(domain, virq);
+	if (irq_data && irq_data->chip_data) {
+		data = irq_data->chip_data;
+		__remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain),
+				      (int)irq_data->hwirq);
+		WARN_ON(!list_empty(&data->irq_2_pin));
+		kfree(irq_data->chip_data);
 	}
-	mutex_unlock(&ioapic_mutex);
-
-	return ret;
+	irq_domain_free_irqs_top(domain, virq, nr_irqs);
 }
 
-/* Enable IOAPIC early just for system timer */
-void __init pre_init_apic_IRQ0(void)
+void mp_irqdomain_activate(struct irq_domain *domain,
+			   struct irq_data *irq_data)
 {
-	struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
+	unsigned long flags;
+	struct irq_pin_list *entry;
+	struct mp_chip_data *data = irq_data->chip_data;
 
-	printk(KERN_INFO "Early APIC setup for system timer0\n");
-#ifndef CONFIG_SMP
-	physid_set_mask_of_physid(boot_cpu_physical_apicid,
-					 &phys_cpu_present_map);
-#endif
-	setup_local_APIC();
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	for_each_irq_pin(entry, data->irq_2_pin)
+		__ioapic_write_entry(entry->apic, entry->pin, data->entry);
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}
 
-	io_apic_setup_irq_pin(0, 0, &attr);
-	irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
-				      "edge");
+void mp_irqdomain_deactivate(struct irq_domain *domain,
+			     struct irq_data *irq_data)
+{
+	/* It won't be called for IRQ with multiple IOAPIC pins associated */
+	ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain),
+			  (int)irq_data->hwirq);
+}
+
+int mp_irqdomain_ioapic_idx(struct irq_domain *domain)
+{
+	return (int)(long)domain->host_data;
 }
+
+const struct irq_domain_ops mp_ioapic_irqdomain_ops = {
+	.alloc		= mp_irqdomain_alloc,
+	.free		= mp_irqdomain_free,
+	.activate	= mp_irqdomain_activate,
+	.deactivate	= mp_irqdomain_deactivate,
+};
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index d6ba2d660dc5..1a9d735e09c6 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -3,6 +3,8 @@
  *
  * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
  *	Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ *	Convert to hierarchical irqdomain
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -14,22 +16,23 @@
 #include <linux/dmar.h>
 #include <linux/hpet.h>
 #include <linux/msi.h>
+#include <asm/irqdomain.h>
 #include <asm/msidef.h>
 #include <asm/hpet.h>
 #include <asm/hw_irq.h>
 #include <asm/apic.h>
 #include <asm/irq_remapping.h>
 
-void native_compose_msi_msg(struct pci_dev *pdev,
-			    unsigned int irq, unsigned int dest,
-			    struct msi_msg *msg, u8 hpet_id)
+static struct irq_domain *msi_default_domain;
+
+static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
+	struct irq_cfg *cfg = irqd_cfg(data);
 
 	msg->address_hi = MSI_ADDR_BASE_HI;
 
 	if (x2apic_enabled())
-		msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest);
+		msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid);
 
 	msg->address_lo =
 		MSI_ADDR_BASE_LO |
@@ -39,7 +42,7 @@ void native_compose_msi_msg(struct pci_dev *pdev,
 		((apic->irq_delivery_mode != dest_LowestPrio) ?
 			MSI_ADDR_REDIRECTION_CPU :
 			MSI_ADDR_REDIRECTION_LOWPRI) |
-		MSI_ADDR_DEST_ID(dest);
+		MSI_ADDR_DEST_ID(cfg->dest_apicid);
 
 	msg->data =
 		MSI_DATA_TRIGGER_EDGE |
@@ -50,180 +53,201 @@ void native_compose_msi_msg(struct pci_dev *pdev,
 		MSI_DATA_VECTOR(cfg->vector);
 }
 
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
-			   struct msi_msg *msg, u8 hpet_id)
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip pci_msi_controller = {
+	.name			= "PCI-MSI",
+	.irq_unmask		= pci_msi_unmask_irq,
+	.irq_mask		= pci_msi_mask_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
+	.irq_compose_msi_msg	= irq_msi_compose_msg,
+	.flags			= IRQCHIP_SKIP_SET_WAKE,
+};
+
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-	struct irq_cfg *cfg;
-	int err;
-	unsigned dest;
+	struct irq_domain *domain;
+	struct irq_alloc_info info;
 
-	if (disable_apic)
-		return -ENXIO;
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_MSI;
+	info.msi_dev = dev;
 
-	cfg = irq_cfg(irq);
-	err = assign_irq_vector(irq, cfg, apic->target_cpus());
-	if (err)
-		return err;
+	domain = irq_remapping_get_irq_domain(&info);
+	if (domain == NULL)
+		domain = msi_default_domain;
+	if (domain == NULL)
+		return -ENOSYS;
 
-	err = apic->cpu_mask_to_apicid_and(cfg->domain,
-					   apic->target_cpus(), &dest);
-	if (err)
-		return err;
+	return pci_msi_domain_alloc_irqs(domain, dev, nvec, type);
+}
 
-	x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id);
+void native_teardown_msi_irq(unsigned int irq)
+{
+	irq_domain_free_irqs(irq, 1);
+}
 
-	return 0;
+static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info,
+					 msi_alloc_info_t *arg)
+{
+	return arg->msi_hwirq;
 }
 
-static int
-msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
+static int pci_msi_prepare(struct irq_domain *domain, struct device *dev,
+			   int nvec, msi_alloc_info_t *arg)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	struct msi_msg msg;
-	unsigned int dest;
-	int ret;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct msi_desc *desc = first_pci_msi_entry(pdev);
+
+	init_irq_alloc_info(arg, NULL);
+	arg->msi_dev = pdev;
+	if (desc->msi_attrib.is_msix) {
+		arg->type = X86_IRQ_ALLOC_TYPE_MSIX;
+	} else {
+		arg->type = X86_IRQ_ALLOC_TYPE_MSI;
+		arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+	}
 
-	ret = apic_set_affinity(data, mask, &dest);
-	if (ret)
-		return ret;
+	return 0;
+}
 
-	__get_cached_msi_msg(data->msi_desc, &msg);
+static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+{
+	arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc);
+}
+
+static struct msi_domain_ops pci_msi_domain_ops = {
+	.get_hwirq	= pci_msi_get_hwirq,
+	.msi_prepare	= pci_msi_prepare,
+	.set_desc	= pci_msi_set_desc,
+};
 
-	msg.data &= ~MSI_DATA_VECTOR_MASK;
-	msg.data |= MSI_DATA_VECTOR(cfg->vector);
-	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
-	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+static struct msi_domain_info pci_msi_domain_info = {
+	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+			  MSI_FLAG_PCI_MSIX,
+	.ops		= &pci_msi_domain_ops,
+	.chip		= &pci_msi_controller,
+	.handler	= handle_edge_irq,
+	.handler_name	= "edge",
+};
 
-	__pci_write_msi_msg(data->msi_desc, &msg);
+void arch_init_msi_domain(struct irq_domain *parent)
+{
+	if (disable_apic)
+		return;
 
-	return IRQ_SET_MASK_OK_NOCOPY;
+	msi_default_domain = pci_msi_create_irq_domain(NULL,
+					&pci_msi_domain_info, parent);
+	if (!msi_default_domain)
+		pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
 }
 
-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
- */
-static struct irq_chip msi_chip = {
-	.name			= "PCI-MSI",
+#ifdef CONFIG_IRQ_REMAP
+static struct irq_chip pci_msi_ir_controller = {
+	.name			= "IR-PCI-MSI",
 	.irq_unmask		= pci_msi_unmask_irq,
 	.irq_mask		= pci_msi_mask_irq,
-	.irq_ack		= apic_ack_edge,
-	.irq_set_affinity	= msi_set_affinity,
-	.irq_retrigger		= apic_retrigger_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
+	.irq_set_vcpu_affinity	= irq_chip_set_vcpu_affinity_parent,
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };
 
-int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
-		  unsigned int irq_base, unsigned int irq_offset)
-{
-	struct irq_chip *chip = &msi_chip;
-	struct msi_msg msg;
-	unsigned int irq = irq_base + irq_offset;
-	int ret;
-
-	ret = msi_compose_msg(dev, irq, &msg, -1);
-	if (ret < 0)
-		return ret;
-
-	irq_set_msi_desc_off(irq_base, irq_offset, msidesc);
-
-	/*
-	 * MSI-X message is written per-IRQ, the offset is always 0.
-	 * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
-	 */
-	if (!irq_offset)
-		pci_write_msi_msg(irq, &msg);
+static struct msi_domain_info pci_msi_ir_domain_info = {
+	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+			  MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX,
+	.ops		= &pci_msi_domain_ops,
+	.chip		= &pci_msi_ir_controller,
+	.handler	= handle_edge_irq,
+	.handler_name	= "edge",
+};
 
-	setup_remapped_irq(irq, irq_cfg(irq), chip);
+struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent)
+{
+	return pci_msi_create_irq_domain(NULL, &pci_msi_ir_domain_info, parent);
+}
+#endif
 
-	irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
+#ifdef CONFIG_DMAR_TABLE
+static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
+{
+	dmar_msi_write(data->irq, msg);
+}
 
-	dev_dbg(&dev->dev, "irq %d for MSI/MSI-X\n", irq);
+static struct irq_chip dmar_msi_controller = {
+	.name			= "DMAR-MSI",
+	.irq_unmask		= dmar_msi_unmask,
+	.irq_mask		= dmar_msi_mask,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_set_affinity	= msi_domain_set_affinity,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
+	.irq_compose_msi_msg	= irq_msi_compose_msg,
+	.irq_write_msi_msg	= dmar_msi_write_msg,
+	.flags			= IRQCHIP_SKIP_SET_WAKE,
+};
 
-	return 0;
+static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info,
+					  msi_alloc_info_t *arg)
+{
+	return arg->dmar_id;
 }
 
-int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+static int dmar_msi_init(struct irq_domain *domain,
+			 struct msi_domain_info *info, unsigned int virq,
+			 irq_hw_number_t hwirq, msi_alloc_info_t *arg)
 {
-	struct msi_desc *msidesc;
-	unsigned int irq;
-	int node, ret;
+	irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL,
+			    handle_edge_irq, arg->dmar_data, "edge");
 
-	/* Multiple MSI vectors only supported with interrupt remapping */
-	if (type == PCI_CAP_ID_MSI && nvec > 1)
-		return 1;
+	return 0;
+}
 
-	node = dev_to_node(&dev->dev);
+static struct msi_domain_ops dmar_msi_domain_ops = {
+	.get_hwirq	= dmar_msi_get_hwirq,
+	.msi_init	= dmar_msi_init,
+};
 
-	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		irq = irq_alloc_hwirq(node);
-		if (!irq)
-			return -ENOSPC;
+static struct msi_domain_info dmar_msi_domain_info = {
+	.ops		= &dmar_msi_domain_ops,
+	.chip		= &dmar_msi_controller,
+};
 
-		ret = setup_msi_irq(dev, msidesc, irq, 0);
-		if (ret < 0) {
-			irq_free_hwirq(irq);
-			return ret;
-		}
+static struct irq_domain *dmar_get_irq_domain(void)
+{
+	static struct irq_domain *dmar_domain;
+	static DEFINE_MUTEX(dmar_lock);
 
-	}
-	return 0;
-}
+	mutex_lock(&dmar_lock);
+	if (dmar_domain == NULL)
+		dmar_domain = msi_create_irq_domain(NULL, &dmar_msi_domain_info,
+						    x86_vector_domain);
+	mutex_unlock(&dmar_lock);
 
-void native_teardown_msi_irq(unsigned int irq)
-{
-	irq_free_hwirq(irq);
+	return dmar_domain;
 }
 
-#ifdef CONFIG_DMAR_TABLE
-static int
-dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
-		      bool force)
+int dmar_alloc_hwirq(int id, int node, void *arg)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	unsigned int dest, irq = data->irq;
-	struct msi_msg msg;
-	int ret;
-
-	ret = apic_set_affinity(data, mask, &dest);
-	if (ret)
-		return ret;
+	struct irq_domain *domain = dmar_get_irq_domain();
+	struct irq_alloc_info info;
 
-	dmar_msi_read(irq, &msg);
+	if (!domain)
+		return -1;
 
-	msg.data &= ~MSI_DATA_VECTOR_MASK;
-	msg.data |= MSI_DATA_VECTOR(cfg->vector);
-	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
-	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-	msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_DMAR;
+	info.dmar_id = id;
+	info.dmar_data = arg;
 
-	dmar_msi_write(irq, &msg);
-
-	return IRQ_SET_MASK_OK_NOCOPY;
+	return irq_domain_alloc_irqs(domain, 1, node, &info);
 }
 
-static struct irq_chip dmar_msi_type = {
-	.name			= "DMAR_MSI",
-	.irq_unmask		= dmar_msi_unmask,
-	.irq_mask		= dmar_msi_mask,
-	.irq_ack		= apic_ack_edge,
-	.irq_set_affinity	= dmar_msi_set_affinity,
-	.irq_retrigger		= apic_retrigger_irq,
-	.flags			= IRQCHIP_SKIP_SET_WAKE,
-};
-
-int arch_setup_dmar_msi(unsigned int irq)
+void dmar_free_hwirq(int irq)
 {
-	int ret;
-	struct msi_msg msg;
-
-	ret = msi_compose_msg(NULL, irq, &msg, -1);
-	if (ret < 0)
-		return ret;
-	dmar_msi_write(irq, &msg);
-	irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
-				      "edge");
-	return 0;
+	irq_domain_free_irqs(irq, 1);
 }
 #endif
 
@@ -231,56 +255,103 @@ int arch_setup_dmar_msi(unsigned int irq)
  * MSI message composition
  */
 #ifdef CONFIG_HPET_TIMER
+static inline int hpet_dev_id(struct irq_domain *domain)
+{
+	struct msi_domain_info *info = msi_get_domain_info(domain);
+
+	return (int)(long)info->data;
+}
 
-static int hpet_msi_set_affinity(struct irq_data *data,
-				 const struct cpumask *mask, bool force)
+static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	struct msi_msg msg;
-	unsigned int dest;
-	int ret;
+	hpet_msi_write(data->handler_data, msg);
+}
 
-	ret = apic_set_affinity(data, mask, &dest);
-	if (ret)
-		return ret;
+static struct irq_chip hpet_msi_controller = {
+	.name = "HPET-MSI",
+	.irq_unmask = hpet_msi_unmask,
+	.irq_mask = hpet_msi_mask,
+	.irq_ack = irq_chip_ack_parent,
+	.irq_set_affinity = msi_domain_set_affinity,
+	.irq_retrigger = irq_chip_retrigger_hierarchy,
+	.irq_compose_msi_msg = irq_msi_compose_msg,
+	.irq_write_msi_msg = hpet_msi_write_msg,
+	.flags = IRQCHIP_SKIP_SET_WAKE,
+};
 
-	hpet_msi_read(data->handler_data, &msg);
+static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info,
+					  msi_alloc_info_t *arg)
+{
+	return arg->hpet_index;
+}
 
-	msg.data &= ~MSI_DATA_VECTOR_MASK;
-	msg.data |= MSI_DATA_VECTOR(cfg->vector);
-	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
-	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+static int hpet_msi_init(struct irq_domain *domain,
+			 struct msi_domain_info *info, unsigned int virq,
+			 irq_hw_number_t hwirq, msi_alloc_info_t *arg)
+{
+	irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
+	irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL,
+			    handle_edge_irq, arg->hpet_data, "edge");
 
-	hpet_msi_write(data->handler_data, &msg);
+	return 0;
+}
 
-	return IRQ_SET_MASK_OK_NOCOPY;
+static void hpet_msi_free(struct irq_domain *domain,
+			  struct msi_domain_info *info, unsigned int virq)
+{
+	irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
 }
 
-static struct irq_chip hpet_msi_type = {
-	.name = "HPET_MSI",
-	.irq_unmask = hpet_msi_unmask,
-	.irq_mask = hpet_msi_mask,
-	.irq_ack = apic_ack_edge,
-	.irq_set_affinity = hpet_msi_set_affinity,
-	.irq_retrigger = apic_retrigger_irq,
-	.flags = IRQCHIP_SKIP_SET_WAKE,
+static struct msi_domain_ops hpet_msi_domain_ops = {
+	.get_hwirq	= hpet_msi_get_hwirq,
+	.msi_init	= hpet_msi_init,
+	.msi_free	= hpet_msi_free,
+};
+
+static struct msi_domain_info hpet_msi_domain_info = {
+	.ops		= &hpet_msi_domain_ops,
+	.chip		= &hpet_msi_controller,
 };
 
-int default_setup_hpet_msi(unsigned int irq, unsigned int id)
+struct irq_domain *hpet_create_irq_domain(int hpet_id)
 {
-	struct irq_chip *chip = &hpet_msi_type;
-	struct msi_msg msg;
-	int ret;
+	struct irq_domain *parent;
+	struct irq_alloc_info info;
+	struct msi_domain_info *domain_info;
+
+	if (x86_vector_domain == NULL)
+		return NULL;
+
+	domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL);
+	if (!domain_info)
+		return NULL;
+
+	*domain_info = hpet_msi_domain_info;
+	domain_info->data = (void *)(long)hpet_id;
+
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_HPET;
+	info.hpet_id = hpet_id;
+	parent = irq_remapping_get_ir_irq_domain(&info);
+	if (parent == NULL)
+		parent = x86_vector_domain;
+	else
+		hpet_msi_controller.name = "IR-HPET-MSI";
+
+	return msi_create_irq_domain(NULL, domain_info, parent);
+}
 
-	ret = msi_compose_msg(NULL, irq, &msg, id);
-	if (ret < 0)
-		return ret;
+int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev,
+		    int dev_num)
+{
+	struct irq_alloc_info info;
 
-	hpet_msi_write(irq_get_handler_data(irq), &msg);
-	irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-	setup_remapped_irq(irq, irq_cfg(irq), chip);
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_HPET;
+	info.hpet_data = dev;
+	info.hpet_id = hpet_dev_id(domain);
+	info.hpet_index = dev_num;
 
-	irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
-	return 0;
+	return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
 }
 #endif
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 6cedd7914581..28eba2d38b15 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -3,6 +3,8 @@
  *
  * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
  *	Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ *	Enable support of hierarchical irqdomains
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -11,15 +13,28 @@
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/compiler.h>
-#include <linux/irqdomain.h>
 #include <linux/slab.h>
+#include <asm/irqdomain.h>
 #include <asm/hw_irq.h>
 #include <asm/apic.h>
 #include <asm/i8259.h>
 #include <asm/desc.h>
 #include <asm/irq_remapping.h>
 
+struct apic_chip_data {
+	struct irq_cfg		cfg;
+	cpumask_var_t		domain;
+	cpumask_var_t		old_domain;
+	u8			move_in_progress : 1;
+};
+
+struct irq_domain *x86_vector_domain;
 static DEFINE_RAW_SPINLOCK(vector_lock);
+static cpumask_var_t vector_cpumask;
+static struct irq_chip lapic_controller;
+#ifdef	CONFIG_X86_IO_APIC
+static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
+#endif
 
 void lock_vector_lock(void)
 {
@@ -34,71 +49,59 @@ void unlock_vector_lock(void)
 	raw_spin_unlock(&vector_lock);
 }
 
-struct irq_cfg *irq_cfg(unsigned int irq)
+static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data)
 {
-	return irq_get_chip_data(irq);
+	if (!irq_data)
+		return NULL;
+
+	while (irq_data->parent_data)
+		irq_data = irq_data->parent_data;
+
+	return irq_data->chip_data;
 }
 
 struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
 {
-	return irq_data->chip_data;
+	struct apic_chip_data *data = apic_chip_data(irq_data);
+
+	return data ? &data->cfg : NULL;
 }
 
-static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
+struct irq_cfg *irq_cfg(unsigned int irq)
 {
-	struct irq_cfg *cfg;
+	return irqd_cfg(irq_get_irq_data(irq));
+}
 
-	cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
-	if (!cfg)
+static struct apic_chip_data *alloc_apic_chip_data(int node)
+{
+	struct apic_chip_data *data;
+
+	data = kzalloc_node(sizeof(*data), GFP_KERNEL, node);
+	if (!data)
 		return NULL;
-	if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
-		goto out_cfg;
-	if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
+	if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node))
+		goto out_data;
+	if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node))
 		goto out_domain;
-#ifdef	CONFIG_X86_IO_APIC
-	INIT_LIST_HEAD(&cfg->irq_2_pin);
-#endif
-	return cfg;
+	return data;
 out_domain:
-	free_cpumask_var(cfg->domain);
-out_cfg:
-	kfree(cfg);
+	free_cpumask_var(data->domain);
+out_data:
+	kfree(data);
 	return NULL;
 }
 
-struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
+static void free_apic_chip_data(struct apic_chip_data *data)
 {
-	int res = irq_alloc_desc_at(at, node);
-	struct irq_cfg *cfg;
-
-	if (res < 0) {
-		if (res != -EEXIST)
-			return NULL;
-		cfg = irq_cfg(at);
-		if (cfg)
-			return cfg;
+	if (data) {
+		free_cpumask_var(data->domain);
+		free_cpumask_var(data->old_domain);
+		kfree(data);
 	}
-
-	cfg = alloc_irq_cfg(at, node);
-	if (cfg)
-		irq_set_chip_data(at, cfg);
-	else
-		irq_free_desc(at);
-	return cfg;
-}
-
-static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
-{
-	if (!cfg)
-		return;
-	irq_set_chip_data(at, NULL);
-	free_cpumask_var(cfg->domain);
-	free_cpumask_var(cfg->old_domain);
-	kfree(cfg);
 }
 
-static int
-__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static int __assign_irq_vector(int irq, struct apic_chip_data *d,
+			       const struct cpumask *mask)
 {
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -114,36 +117,33 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 	static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
 	static int current_offset = VECTOR_OFFSET_START % 16;
 	int cpu, err;
-	cpumask_var_t tmp_mask;
 
-	if (cfg->move_in_progress)
+	if (d->move_in_progress)
 		return -EBUSY;
 
-	if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
-		return -ENOMEM;
-
 	/* Only try and allocate irqs on cpus that are present */
 	err = -ENOSPC;
-	cpumask_clear(cfg->old_domain);
+	cpumask_clear(d->old_domain);
 	cpu = cpumask_first_and(mask, cpu_online_mask);
 	while (cpu < nr_cpu_ids) {
 		int new_cpu, vector, offset;
 
-		apic->vector_allocation_domain(cpu, tmp_mask, mask);
+		apic->vector_allocation_domain(cpu, vector_cpumask, mask);
 
-		if (cpumask_subset(tmp_mask, cfg->domain)) {
+		if (cpumask_subset(vector_cpumask, d->domain)) {
 			err = 0;
-			if (cpumask_equal(tmp_mask, cfg->domain))
+			if (cpumask_equal(vector_cpumask, d->domain))
 				break;
 			/*
 			 * New cpumask using the vector is a proper subset of
 			 * the current in use mask. So cleanup the vector
 			 * allocation for the members that are not used anymore.
 			 */
-			cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
-			cfg->move_in_progress =
-			   cpumask_intersects(cfg->old_domain, cpu_online_mask);
-			cpumask_and(cfg->domain, cfg->domain, tmp_mask);
+			cpumask_andnot(d->old_domain, d->domain,
+				       vector_cpumask);
+			d->move_in_progress =
+			   cpumask_intersects(d->old_domain, cpu_online_mask);
+			cpumask_and(d->domain, d->domain, vector_cpumask);
 			break;
 		}
 
@@ -157,16 +157,18 @@ next:
 		}
 
 		if (unlikely(current_vector == vector)) {
-			cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
-			cpumask_andnot(tmp_mask, mask, cfg->old_domain);
-			cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
+			cpumask_or(d->old_domain, d->old_domain,
+				   vector_cpumask);
+			cpumask_andnot(vector_cpumask, mask, d->old_domain);
+			cpu = cpumask_first_and(vector_cpumask,
+						cpu_online_mask);
 			continue;
 		}
 
 		if (test_bit(vector, used_vectors))
 			goto next;
 
-		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+		for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
 			if (per_cpu(vector_irq, new_cpu)[vector] >
 			    VECTOR_UNDEFINED)
 				goto next;
@@ -174,55 +176,73 @@ next:
 		/* Found one! */
 		current_vector = vector;
 		current_offset = offset;
-		if (cfg->vector) {
-			cpumask_copy(cfg->old_domain, cfg->domain);
-			cfg->move_in_progress =
-			   cpumask_intersects(cfg->old_domain, cpu_online_mask);
+		if (d->cfg.vector) {
+			cpumask_copy(d->old_domain, d->domain);
+			d->move_in_progress =
+			   cpumask_intersects(d->old_domain, cpu_online_mask);
 		}
-		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+		for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
 			per_cpu(vector_irq, new_cpu)[vector] = irq;
-		cfg->vector = vector;
-		cpumask_copy(cfg->domain, tmp_mask);
+		d->cfg.vector = vector;
+		cpumask_copy(d->domain, vector_cpumask);
 		err = 0;
 		break;
 	}
-	free_cpumask_var(tmp_mask);
+
+	if (!err) {
+		/* cache destination APIC IDs into cfg->dest_apicid */
+		err = apic->cpu_mask_to_apicid_and(mask, d->domain,
+						   &d->cfg.dest_apicid);
+	}
 
 	return err;
 }
 
-int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static int assign_irq_vector(int irq, struct apic_chip_data *data,
+			     const struct cpumask *mask)
 {
 	int err;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, cfg, mask);
+	err = __assign_irq_vector(irq, data, mask);
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }
 
-void clear_irq_vector(int irq, struct irq_cfg *cfg)
+static int assign_irq_vector_policy(int irq, int node,
+				    struct apic_chip_data *data,
+				    struct irq_alloc_info *info)
+{
+	if (info && info->mask)
+		return assign_irq_vector(irq, data, info->mask);
+	if (node != NUMA_NO_NODE &&
+	    assign_irq_vector(irq, data, cpumask_of_node(node)) == 0)
+		return 0;
+	return assign_irq_vector(irq, data, apic->target_cpus());
+}
+
+static void clear_irq_vector(int irq, struct apic_chip_data *data)
 {
 	int cpu, vector;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	BUG_ON(!cfg->vector);
+	BUG_ON(!data->cfg.vector);
 
-	vector = cfg->vector;
-	for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+	vector = data->cfg.vector;
+	for_each_cpu_and(cpu, data->domain, cpu_online_mask)
 		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 
-	cfg->vector = 0;
-	cpumask_clear(cfg->domain);
+	data->cfg.vector = 0;
+	cpumask_clear(data->domain);
 
-	if (likely(!cfg->move_in_progress)) {
+	if (likely(!data->move_in_progress)) {
 		raw_spin_unlock_irqrestore(&vector_lock, flags);
 		return;
 	}
 
-	for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+	for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
 		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
 		     vector++) {
 			if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -231,10 +251,95 @@ void clear_irq_vector(int irq, struct irq_cfg *cfg)
 			break;
 		}
 	}
-	cfg->move_in_progress = 0;
+	data->move_in_progress = 0;
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 }
 
+void init_irq_alloc_info(struct irq_alloc_info *info,
+			 const struct cpumask *mask)
+{
+	memset(info, 0, sizeof(*info));
+	info->mask = mask;
+}
+
+void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
+{
+	if (src)
+		*dst = *src;
+	else
+		memset(dst, 0, sizeof(*dst));
+}
+
+static void x86_vector_free_irqs(struct irq_domain *domain,
+				 unsigned int virq, unsigned int nr_irqs)
+{
+	struct irq_data *irq_data;
+	int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i);
+		if (irq_data && irq_data->chip_data) {
+			clear_irq_vector(virq + i, irq_data->chip_data);
+			free_apic_chip_data(irq_data->chip_data);
+#ifdef	CONFIG_X86_IO_APIC
+			if (virq + i < nr_legacy_irqs())
+				legacy_irq_data[virq + i] = NULL;
+#endif
+			irq_domain_reset_irq_data(irq_data);
+		}
+	}
+}
+
+static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
+				 unsigned int nr_irqs, void *arg)
+{
+	struct irq_alloc_info *info = arg;
+	struct apic_chip_data *data;
+	struct irq_data *irq_data;
+	int i, err;
+
+	if (disable_apic)
+		return -ENXIO;
+
+	/* Currently vector allocator can't guarantee contiguous allocations */
+	if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
+		return -ENOSYS;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(domain, virq + i);
+		BUG_ON(!irq_data);
+#ifdef	CONFIG_X86_IO_APIC
+		if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i])
+			data = legacy_irq_data[virq + i];
+		else
+#endif
+			data = alloc_apic_chip_data(irq_data->node);
+		if (!data) {
+			err = -ENOMEM;
+			goto error;
+		}
+
+		irq_data->chip = &lapic_controller;
+		irq_data->chip_data = data;
+		irq_data->hwirq = virq + i;
+		err = assign_irq_vector_policy(virq, irq_data->node, data,
+					       info);
+		if (err)
+			goto error;
+	}
+
+	return 0;
+
+error:
+	x86_vector_free_irqs(domain, virq, i + 1);
+	return err;
+}
+
+static const struct irq_domain_ops x86_vector_domain_ops = {
+	.alloc	= x86_vector_alloc_irqs,
+	.free	= x86_vector_free_irqs,
+};
+
 int __init arch_probe_nr_irqs(void)
 {
 	int nr;
@@ -258,8 +363,43 @@ int __init arch_probe_nr_irqs(void)
 	return nr_legacy_irqs();
 }
 
+#ifdef	CONFIG_X86_IO_APIC
+static void init_legacy_irqs(void)
+{
+	int i, node = cpu_to_node(0);
+	struct apic_chip_data *data;
+
+	/*
+	 * For legacy IRQ's, start with assigning irq0 to irq15 to
+	 * ISA_IRQ_VECTOR(i) for all cpu's.
+	 */
+	for (i = 0; i < nr_legacy_irqs(); i++) {
+		data = legacy_irq_data[i] = alloc_apic_chip_data(node);
+		BUG_ON(!data);
+
+		data->cfg.vector = ISA_IRQ_VECTOR(i);
+		cpumask_setall(data->domain);
+		irq_set_chip_data(i, data);
+	}
+}
+#else
+static void init_legacy_irqs(void) { }
+#endif
+
 int __init arch_early_irq_init(void)
 {
+	init_legacy_irqs();
+
+	x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops,
+						NULL);
+	BUG_ON(x86_vector_domain == NULL);
+	irq_set_default_host(x86_vector_domain);
+
+	arch_init_msi_domain(x86_vector_domain);
+	arch_init_htirq_domain(x86_vector_domain);
+
+	BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL));
+
 	return arch_early_ioapic_init();
 }
 
@@ -267,7 +407,7 @@ static void __setup_vector_irq(int cpu)
 {
 	/* Initialize vector_irq on a new cpu */
 	int irq, vector;
-	struct irq_cfg *cfg;
+	struct apic_chip_data *data;
 
 	/*
 	 * vector_lock will make sure that we don't run into irq vector
@@ -277,13 +417,13 @@ static void __setup_vector_irq(int cpu)
 	raw_spin_lock(&vector_lock);
 	/* Mark the inuse vectors */
 	for_each_active_irq(irq) {
-		cfg = irq_cfg(irq);
-		if (!cfg)
+		data = apic_chip_data(irq_get_irq_data(irq));
+		if (!data)
 			continue;
 
-		if (!cpumask_test_cpu(cpu, cfg->domain))
+		if (!cpumask_test_cpu(cpu, data->domain))
 			continue;
-		vector = cfg->vector;
+		vector = data->cfg.vector;
 		per_cpu(vector_irq, cpu)[vector] = irq;
 	}
 	/* Mark the free vectors */
@@ -292,8 +432,8 @@ static void __setup_vector_irq(int cpu)
 		if (irq <= VECTOR_UNDEFINED)
 			continue;
 
-		cfg = irq_cfg(irq);
-		if (!cpumask_test_cpu(cpu, cfg->domain))
+		data = apic_chip_data(irq_get_irq_data(irq));
+		if (!cpumask_test_cpu(cpu, data->domain))
 			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 	}
 	raw_spin_unlock(&vector_lock);
@@ -314,20 +454,20 @@ void setup_vector_irq(int cpu)
 	 * legacy vector to irq mapping:
 	 */
 	for (irq = 0; irq < nr_legacy_irqs(); irq++)
-		per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
+		per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq;
 
 	__setup_vector_irq(cpu);
 }
 
-int apic_retrigger_irq(struct irq_data *data)
+static int apic_retrigger_irq(struct irq_data *irq_data)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
+	struct apic_chip_data *data = apic_chip_data(irq_data);
 	unsigned long flags;
 	int cpu;
 
 	raw_spin_lock_irqsave(&vector_lock, flags);
-	cpu = cpumask_first_and(cfg->domain, cpu_online_mask);
-	apic->send_IPI_mask(cpumask_of(cpu), cfg->vector);
+	cpu = cpumask_first_and(data->domain, cpu_online_mask);
+	apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector);
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 
 	return 1;
@@ -340,73 +480,76 @@ void apic_ack_edge(struct irq_data *data)
 	ack_APIC_irq();
 }
 
-/*
- * Either sets data->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
- * leaves data->affinity untouched.
- */
-int apic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-		      unsigned int *dest_id)
+static int apic_set_affinity(struct irq_data *irq_data,
+			     const struct cpumask *dest, bool force)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	unsigned int irq = data->irq;
-	int err;
+	struct apic_chip_data *data = irq_data->chip_data;
+	int err, irq = irq_data->irq;
 
 	if (!config_enabled(CONFIG_SMP))
 		return -EPERM;
 
-	if (!cpumask_intersects(mask, cpu_online_mask))
+	if (!cpumask_intersects(dest, cpu_online_mask))
 		return -EINVAL;
 
-	err = assign_irq_vector(irq, cfg, mask);
-	if (err)
-		return err;
-
-	err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id);
+	err = assign_irq_vector(irq, data, dest);
 	if (err) {
-		if (assign_irq_vector(irq, cfg, data->affinity))
+		struct irq_data *top = irq_get_irq_data(irq);
+
+		if (assign_irq_vector(irq, data, top->affinity))
 			pr_err("Failed to recover vector for irq %d\n", irq);
 		return err;
 	}
 
-	cpumask_copy(data->affinity, mask);
-
-	return 0;
+	return IRQ_SET_MASK_OK;
 }
 
+static struct irq_chip lapic_controller = {
+	.irq_ack		= apic_ack_edge,
+	.irq_set_affinity	= apic_set_affinity,
+	.irq_retrigger		= apic_retrigger_irq,
+};
+
 #ifdef CONFIG_SMP
-void send_cleanup_vector(struct irq_cfg *cfg)
+static void __send_cleanup_vector(struct apic_chip_data *data)
 {
 	cpumask_var_t cleanup_mask;
 
 	if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
 		unsigned int i;
 
-		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+		for_each_cpu_and(i, data->old_domain, cpu_online_mask)
 			apic->send_IPI_mask(cpumask_of(i),
 					    IRQ_MOVE_CLEANUP_VECTOR);
 	} else {
-		cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+		cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
 		apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
 		free_cpumask_var(cleanup_mask);
 	}
-	cfg->move_in_progress = 0;
+	data->move_in_progress = 0;
+}
+
+void send_cleanup_vector(struct irq_cfg *cfg)
+{
+	struct apic_chip_data *data;
+
+	data = container_of(cfg, struct apic_chip_data, cfg);
+	if (data->move_in_progress)
+		__send_cleanup_vector(data);
 }
 
 asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 {
 	unsigned vector, me;
 
-	ack_APIC_irq();
-	irq_enter();
-	exit_idle();
+	entering_ack_irq();
 
 	me = smp_processor_id();
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 		int irq;
 		unsigned int irr;
 		struct irq_desc *desc;
-		struct irq_cfg *cfg;
+		struct apic_chip_data *data;
 
 		irq = __this_cpu_read(vector_irq[vector]);
 
@@ -417,8 +560,8 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 		if (!desc)
 			continue;
 
-		cfg = irq_cfg(irq);
-		if (!cfg)
+		data = apic_chip_data(&desc->irq_data);
+		if (!data)
 			continue;
 
 		raw_spin_lock(&desc->lock);
@@ -427,10 +570,11 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 		 * Check if the irq migration is in progress. If so, we
 		 * haven't received the cleanup request yet for this irq.
 		 */
-		if (cfg->move_in_progress)
+		if (data->move_in_progress)
 			goto unlock;
 
-		if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+		if (vector == data->cfg.vector &&
+		    cpumask_test_cpu(me, data->domain))
 			goto unlock;
 
 		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -450,20 +594,21 @@ unlock:
 		raw_spin_unlock(&desc->lock);
 	}
 
-	irq_exit();
+	exiting_irq();
 }
 
 static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
 {
 	unsigned me;
+	struct apic_chip_data *data;
 
-	if (likely(!cfg->move_in_progress))
+	data = container_of(cfg, struct apic_chip_data, cfg);
+	if (likely(!data->move_in_progress))
 		return;
 
 	me = smp_processor_id();
-
-	if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
-		send_cleanup_vector(cfg);
+	if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain))
+		__send_cleanup_vector(data);
 }
 
 void irq_complete_move(struct irq_cfg *cfg)
@@ -475,46 +620,11 @@ void irq_force_complete_move(int irq)
 {
 	struct irq_cfg *cfg = irq_cfg(irq);
 
-	if (!cfg)
-		return;
-
-	__irq_complete_move(cfg, cfg->vector);
+	if (cfg)
+		__irq_complete_move(cfg, cfg->vector);
 }
 #endif
 
-/*
- * Dynamic irq allocate and deallocation. Should be replaced by irq domains!
- */
-int arch_setup_hwirq(unsigned int irq, int node)
-{
-	struct irq_cfg *cfg;
-	unsigned long flags;
-	int ret;
-
-	cfg = alloc_irq_cfg(irq, node);
-	if (!cfg)
-		return -ENOMEM;
-
-	raw_spin_lock_irqsave(&vector_lock, flags);
-	ret = __assign_irq_vector(irq, cfg, apic->target_cpus());
-	raw_spin_unlock_irqrestore(&vector_lock, flags);
-
-	if (!ret)
-		irq_set_chip_data(irq, cfg);
-	else
-		free_irq_cfg(irq, cfg);
-	return ret;
-}
-
-void arch_teardown_hwirq(unsigned int irq)
-{
-	struct irq_cfg *cfg = irq_cfg(irq);
-
-	free_remapped_irq(irq);
-	clear_irq_vector(irq, cfg);
-	free_irq_cfg(irq, cfg);
-}
-
 static void __init print_APIC_field(int base)
 {
 	int i;
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 6fae733e9194..3ffd925655e0 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -21,11 +21,13 @@ early_param("x2apic_phys", set_x2apic_phys_mode);
 
 static bool x2apic_fadt_phys(void)
 {
+#ifdef CONFIG_ACPI
 	if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
 		(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
 		printk(KERN_DEBUG "System requires x2apic physical mode\n");
 		return true;
 	}
+#endif
 	return false;
 }
 
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 9f6b9341950f..8e3d22a1af94 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -41,6 +41,25 @@ void common(void) {
 	OFFSET(pbe_orig_address, pbe, orig_address);
 	OFFSET(pbe_next, pbe, next);
 
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+	BLANK();
+	OFFSET(IA32_SIGCONTEXT_ax, sigcontext_ia32, ax);
+	OFFSET(IA32_SIGCONTEXT_bx, sigcontext_ia32, bx);
+	OFFSET(IA32_SIGCONTEXT_cx, sigcontext_ia32, cx);
+	OFFSET(IA32_SIGCONTEXT_dx, sigcontext_ia32, dx);
+	OFFSET(IA32_SIGCONTEXT_si, sigcontext_ia32, si);
+	OFFSET(IA32_SIGCONTEXT_di, sigcontext_ia32, di);
+	OFFSET(IA32_SIGCONTEXT_bp, sigcontext_ia32, bp);
+	OFFSET(IA32_SIGCONTEXT_sp, sigcontext_ia32, sp);
+	OFFSET(IA32_SIGCONTEXT_ip, sigcontext_ia32, ip);
+
+	BLANK();
+	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+
+	BLANK();
+	OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
+#endif
+
 #ifdef CONFIG_PARAVIRT
 	BLANK();
 	OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
@@ -49,7 +68,9 @@ void common(void) {
 	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
 	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
 	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+#ifdef CONFIG_X86_32
 	OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
+#endif
 	OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
 	OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
 #endif
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 47703aed74cf..6ce39025f467 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -17,17 +17,6 @@ void foo(void);
 
 void foo(void)
 {
-	OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
-	OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx);
-	OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx);
-	OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx);
-	OFFSET(IA32_SIGCONTEXT_si, sigcontext, si);
-	OFFSET(IA32_SIGCONTEXT_di, sigcontext, di);
-	OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp);
-	OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp);
-	OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip);
-	BLANK();
-
 	OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
 	OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
 	OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
@@ -37,10 +26,6 @@ void foo(void)
 	OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
 	BLANK();
 
-	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
-	OFFSET(TI_cpu, thread_info, cpu);
-	BLANK();
-
 	OFFSET(PT_EBX, pt_regs, bx);
 	OFFSET(PT_ECX, pt_regs, cx);
 	OFFSET(PT_EDX, pt_regs, dx);
@@ -60,9 +45,6 @@ void foo(void)
 	OFFSET(PT_OLDSS,  pt_regs, ss);
 	BLANK();
 
-	OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
-	BLANK();
-
 	OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
 	BLANK();
 
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 5ce6f2da8763..d8f42f902a0f 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -29,27 +29,6 @@ int main(void)
 	BLANK();
 #endif
 
-#ifdef CONFIG_IA32_EMULATION
-	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
-	BLANK();
-
-#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry)
-	ENTRY(ax);
-	ENTRY(bx);
-	ENTRY(cx);
-	ENTRY(dx);
-	ENTRY(si);
-	ENTRY(di);
-	ENTRY(bp);
-	ENTRY(sp);
-	ENTRY(ip);
-	BLANK();
-#undef ENTRY
-
-	OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
-	BLANK();
-#endif
-
 #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
 	ENTRY(bx);
 	ENTRY(cx);
@@ -87,7 +66,7 @@ int main(void)
 	DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
 	DEFINE(NR_syscalls, sizeof(syscalls_64));
 
-	DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1);
+	DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1);
 	DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
 
 	return 0;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e4cf63301ff4..dd3a4baffe50 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -19,6 +19,13 @@
 
 #include "cpu.h"
 
+/*
+ * nodes_per_socket: Stores the number of nodes per socket.
+ * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX
+ * Node Identifiers[10:8]
+ */
+static u32 nodes_per_socket = 1;
+
 static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
 {
 	u32 gprs[8] = { 0 };
@@ -288,10 +295,10 @@ static int nearby_node(int apicid)
  *     Assumption: Number of cores in each internal node is the same.
  * (2) AMD processors supporting compute units
  */
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
 static void amd_get_topology(struct cpuinfo_x86 *c)
 {
-	u32 nodes, cores_per_cu = 1;
+	u32 cores_per_cu = 1;
 	u8 node_id;
 	int cpu = smp_processor_id();
 
@@ -300,7 +307,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 		u32 eax, ebx, ecx, edx;
 
 		cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
-		nodes = ((ecx >> 8) & 7) + 1;
+		nodes_per_socket = ((ecx >> 8) & 7) + 1;
 		node_id = ecx & 7;
 
 		/* get compute unit information */
@@ -311,18 +318,18 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
 		u64 value;
 
 		rdmsrl(MSR_FAM10H_NODE_ID, value);
-		nodes = ((value >> 3) & 7) + 1;
+		nodes_per_socket = ((value >> 3) & 7) + 1;
 		node_id = value & 7;
 	} else
 		return;
 
 	/* fixup multi-node processor information */
-	if (nodes > 1) {
+	if (nodes_per_socket > 1) {
 		u32 cores_per_node;
 		u32 cus_per_node;
 
 		set_cpu_cap(c, X86_FEATURE_AMD_DCM);
-		cores_per_node = c->x86_max_cores / nodes;
+		cores_per_node = c->x86_max_cores / nodes_per_socket;
 		cus_per_node = cores_per_node / cores_per_cu;
 
 		/* store NodeID, use llc_shared_map to store sibling info */
@@ -341,7 +348,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
  */
 static void amd_detect_cmp(struct cpuinfo_x86 *c)
 {
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
 	unsigned bits;
 	int cpu = smp_processor_id();
 
@@ -366,6 +373,12 @@ u16 amd_get_nb_id(int cpu)
 }
 EXPORT_SYMBOL_GPL(amd_get_nb_id);
 
+u32 amd_get_nodes_per_socket(void)
+{
+	return nodes_per_socket;
+}
+EXPORT_SYMBOL_GPL(amd_get_nodes_per_socket);
+
 static void srat_detect_node(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_NUMA
@@ -420,7 +433,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
 
 static void early_init_amd_mc(struct cpuinfo_x86 *c)
 {
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
 	unsigned bits, ecx;
 
 	/* Multi core CPU? */
@@ -520,8 +533,16 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 			set_cpu_cap(c, X86_FEATURE_K6_MTRR);
 #endif
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
-	/* check CPU config space for extended APIC ID */
-	if (cpu_has_apic && c->x86 >= 0xf) {
+	/*
+	 * ApicID can always be treated as an 8-bit value for AMD APIC versions
+	 * >= 0x10, but even old K8s came out of reset with version 0x10. So, we
+	 * can safely set X86_FEATURE_EXTD_APICID unconditionally for families
+	 * after 16h.
+	 */
+	if (cpu_has_apic && c->x86 > 0x16) {
+		set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+	} else if (cpu_has_apic && c->x86 >= 0xf) {
+		/* check CPU config space for extended APIC ID */
 		unsigned int val;
 		val = read_pci_config(0, 24, 0, 0x68);
 		if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 03445346ee0a..bd17db15a2c1 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -12,57 +12,11 @@
 #include <asm/bugs.h>
 #include <asm/processor.h>
 #include <asm/processor-flags.h>
-#include <asm/i387.h>
+#include <asm/fpu/internal.h>
 #include <asm/msr.h>
 #include <asm/paravirt.h>
 #include <asm/alternative.h>
 
-static double __initdata x = 4195835.0;
-static double __initdata y = 3145727.0;
-
-/*
- * This used to check for exceptions..
- * However, it turns out that to support that,
- * the XMM trap handlers basically had to
- * be buggy. So let's have a correct XMM trap
- * handler, and forget about printing out
- * some status at boot.
- *
- * We should really only care about bugs here
- * anyway. Not features.
- */
-static void __init check_fpu(void)
-{
-	s32 fdiv_bug;
-
-	kernel_fpu_begin();
-
-	/*
-	 * trap_init() enabled FXSR and company _before_ testing for FP
-	 * problems here.
-	 *
-	 * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug
-	 */
-	__asm__("fninit\n\t"
-		"fldl %1\n\t"
-		"fdivl %2\n\t"
-		"fmull %2\n\t"
-		"fldl %1\n\t"
-		"fsubp %%st,%%st(1)\n\t"
-		"fistpl %0\n\t"
-		"fwait\n\t"
-		"fninit"
-		: "=m" (*&fdiv_bug)
-		: "m" (*&x), "m" (*&y));
-
-	kernel_fpu_end();
-
-	if (fdiv_bug) {
-		set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
-		pr_warn("Hmm, FPU with FDIV bug\n");
-	}
-}
-
 void __init check_bugs(void)
 {
 	identify_boot_cpu();
@@ -85,10 +39,5 @@ void __init check_bugs(void)
 		'0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
 	alternative_instructions();
 
-	/*
-	 * kernel_fpu_begin/end() in check_fpu() relies on the patched
-	 * alternative instructions.
-	 */
-	if (cpu_has_fpu)
-		check_fpu();
+	fpu__init_check_bugs();
 }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a62cf04dac8a..9fc5e3d9d9c8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -5,6 +5,7 @@
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/string.h>
+#include <linux/ctype.h>
 #include <linux/delay.h>
 #include <linux/sched.h>
 #include <linux/init.h>
@@ -31,8 +32,7 @@
 #include <asm/setup.h>
 #include <asm/apic.h>
 #include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/mtrr.h>
 #include <linux/numa.h>
 #include <asm/asm.h>
@@ -145,32 +145,21 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 } };
 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 
-static int __init x86_xsave_setup(char *s)
+static int __init x86_mpx_setup(char *s)
 {
+	/* require an exact match without trailing characters */
 	if (strlen(s))
 		return 0;
-	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-	setup_clear_cpu_cap(X86_FEATURE_AVX);
-	setup_clear_cpu_cap(X86_FEATURE_AVX2);
-	return 1;
-}
-__setup("noxsave", x86_xsave_setup);
 
-static int __init x86_xsaveopt_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-	return 1;
-}
-__setup("noxsaveopt", x86_xsaveopt_setup);
+	/* do not emit a message if the feature is not present */
+	if (!boot_cpu_has(X86_FEATURE_MPX))
+		return 1;
 
-static int __init x86_xsaves_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+	setup_clear_cpu_cap(X86_FEATURE_MPX);
+	pr_info("nompx: Intel Memory Protection Extensions (MPX) disabled\n");
 	return 1;
 }
-__setup("noxsaves", x86_xsaves_setup);
+__setup("nompx", x86_mpx_setup);
 
 #ifdef CONFIG_X86_32
 static int cachesize_override = -1;
@@ -183,14 +172,6 @@ static int __init cachesize_setup(char *str)
 }
 __setup("cachesize=", cachesize_setup);
 
-static int __init x86_fxsr_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_FXSR);
-	setup_clear_cpu_cap(X86_FEATURE_XMM);
-	return 1;
-}
-__setup("nofxsr", x86_fxsr_setup);
-
 static int __init x86_sep_setup(char *s)
 {
 	setup_clear_cpu_cap(X86_FEATURE_SEP);
@@ -419,7 +400,7 @@ static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
 static void get_model_name(struct cpuinfo_x86 *c)
 {
 	unsigned int *v;
-	char *p, *q;
+	char *p, *q, *s;
 
 	if (c->extended_cpuid_level < 0x80000004)
 		return;
@@ -430,19 +411,21 @@ static void get_model_name(struct cpuinfo_x86 *c)
 	cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
 	c->x86_model_id[48] = 0;
 
-	/*
-	 * Intel chips right-justify this string for some dumb reason;
-	 * undo that brain damage:
-	 */
-	p = q = &c->x86_model_id[0];
+	/* Trim whitespace */
+	p = q = s = &c->x86_model_id[0];
+
 	while (*p == ' ')
 		p++;
-	if (p != q) {
-		while (*p)
-			*q++ = *p++;
-		while (q <= &c->x86_model_id[48])
-			*q++ = '\0';	/* Zero-pad the rest */
+
+	while (*p) {
+		/* Note the last non-whitespace index */
+		if (!isspace(*p))
+			s = q;
+
+		*q++ = *p++;
 	}
+
+	*(s + 1) = '\0';
 }
 
 void cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
@@ -508,7 +491,7 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c)
 
 void detect_ht(struct cpuinfo_x86 *c)
 {
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
 	u32 eax, ebx, ecx, edx;
 	int index_msb, core_bits;
 	static bool printed;
@@ -759,7 +742,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	cpu_detect(c);
 	get_cpu_vendor(c);
 	get_cpu_cap(c);
-	fpu_detect(c);
+	fpu__init_system(c);
 
 	if (this_cpu->c_early_init)
 		this_cpu->c_early_init(c);
@@ -844,7 +827,7 @@ static void generic_identify(struct cpuinfo_x86 *c)
 	if (c->cpuid_level >= 0x00000001) {
 		c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
 #ifdef CONFIG_X86_32
-# ifdef CONFIG_X86_HT
+# ifdef CONFIG_SMP
 		c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
 # else
 		c->apicid = c->initial_apicid;
@@ -1026,7 +1009,7 @@ void enable_sep_cpu(void)
 	      (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
 	      0);
 
-	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
+	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
 
 out:
 	put_cpu();
@@ -1122,7 +1105,7 @@ void print_cpu_info(struct cpuinfo_x86 *c)
 		printk(KERN_CONT "%s ", vendor);
 
 	if (c->x86_model_id[0])
-		printk(KERN_CONT "%s", strim(c->x86_model_id));
+		printk(KERN_CONT "%s", c->x86_model_id);
 	else
 		printk(KERN_CONT "%d86", c->x86);
 
@@ -1155,10 +1138,6 @@ static __init int setup_disablecpuid(char *arg)
 }
 __setup("clearcpuid=", setup_disablecpuid);
 
-DEFINE_PER_CPU(unsigned long, kernel_stack) =
-	(unsigned long)&init_thread_union + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(kernel_stack);
-
 #ifdef CONFIG_X86_64
 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
 struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1,
@@ -1183,8 +1162,6 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
-DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
-
 /*
  * Special IST stacks which the CPU switches to when it calls
  * an IST-marked descriptor entry. Up to 7 stacks (hardware
@@ -1208,10 +1185,10 @@ void syscall_init(void)
 	 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
 	 */
 	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
-	wrmsrl(MSR_LSTAR, system_call);
+	wrmsrl(MSR_LSTAR, entry_SYSCALL_64);
 
 #ifdef CONFIG_IA32_EMULATION
-	wrmsrl(MSR_CSTAR, ia32_cstar_target);
+	wrmsrl(MSR_CSTAR, entry_SYSCALL_compat);
 	/*
 	 * This only works on Intel CPUs.
 	 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
@@ -1220,7 +1197,7 @@ void syscall_init(void)
 	 */
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
-	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
 	wrmsrl(MSR_CSTAR, ignore_sysret);
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
@@ -1275,7 +1252,6 @@ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
-DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
 
 /*
  * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
@@ -1439,7 +1415,7 @@ void cpu_init(void)
 	clear_all_debug_regs();
 	dbg_restore_debug_regs();
 
-	fpu_init();
+	fpu__init_cpu();
 
 	if (is_uv_system())
 		uv_cpu_init();
@@ -1495,7 +1471,7 @@ void cpu_init(void)
 	clear_all_debug_regs();
 	dbg_restore_debug_regs();
 
-	fpu_init();
+	fpu__init_cpu();
 }
 #endif
 
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index edcb0e28c336..be4febc58b94 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -654,7 +654,7 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
 	unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
 	unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
 	unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
 	unsigned int cpu = c->cpu_index;
 #endif
 
@@ -773,19 +773,19 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
 
 	if (new_l2) {
 		l2 = new_l2;
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
 		per_cpu(cpu_llc_id, cpu) = l2_id;
 #endif
 	}
 
 	if (new_l3) {
 		l3 = new_l3;
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
 		per_cpu(cpu_llc_id, cpu) = l3_id;
 #endif
 	}
 
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
 	/*
 	 * If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
 	 * turns means that the only possibility is SMT (as indicated in
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index e535533d5ab8..df919ff103c3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -53,9 +53,12 @@
 static DEFINE_MUTEX(mce_chrdev_read_mutex);
 
 #define rcu_dereference_check_mce(p) \
-	rcu_dereference_index_check((p), \
-			      rcu_read_lock_sched_held() || \
-			      lockdep_is_held(&mce_chrdev_read_mutex))
+({ \
+	rcu_lockdep_assert(rcu_read_lock_sched_held() || \
+			   lockdep_is_held(&mce_chrdev_read_mutex), \
+			   "suspicious rcu_dereference_check_mce() usage"); \
+	smp_load_acquire(&(p)); \
+})
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/mce.h>
@@ -708,6 +711,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 			  struct pt_regs *regs)
 {
 	int i, ret = 0;
+	char *tmp;
 
 	for (i = 0; i < mca_cfg.banks; i++) {
 		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
@@ -716,9 +720,11 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
 			if (quirk_no_way_out)
 				quirk_no_way_out(i, m, regs);
 		}
-		if (mce_severity(m, mca_cfg.tolerant, msg, true) >=
-		    MCE_PANIC_SEVERITY)
+
+		if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
+			*msg = tmp;
 			ret = 1;
+		}
 	}
 	return ret;
 }
@@ -1047,6 +1053,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	char *msg = "Unknown";
 	u64 recover_paddr = ~0ull;
 	int flags = MF_ACTION_REQUIRED;
+	int lmce = 0;
 
 	prev_state = ist_enter(regs);
 
@@ -1074,11 +1081,20 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		kill_it = 1;
 
 	/*
-	 * Go through all the banks in exclusion of the other CPUs.
-	 * This way we don't report duplicated events on shared banks
-	 * because the first one to see it will clear it.
+	 * Check if this MCE is signaled to only this logical processor
 	 */
-	order = mce_start(&no_way_out);
+	if (m.mcgstatus & MCG_STATUS_LMCES)
+		lmce = 1;
+	else {
+		/*
+		 * Go through all the banks in exclusion of the other CPUs.
+		 * This way we don't report duplicated events on shared banks
+		 * because the first one to see it will clear it.
+		 * If this is a Local MCE, then no need to perform rendezvous.
+		 */
+		order = mce_start(&no_way_out);
+	}
+
 	for (i = 0; i < cfg->banks; i++) {
 		__clear_bit(i, toclear);
 		if (!test_bit(i, valid_banks))
@@ -1155,8 +1171,18 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	 * Do most of the synchronization with other CPUs.
 	 * When there's any problem use only local no_way_out state.
 	 */
-	if (mce_end(order) < 0)
-		no_way_out = worst >= MCE_PANIC_SEVERITY;
+	if (!lmce) {
+		if (mce_end(order) < 0)
+			no_way_out = worst >= MCE_PANIC_SEVERITY;
+	} else {
+		/*
+		 * Local MCE skipped calling mce_reign()
+		 * If we found a fatal error, we need to panic here.
+		 */
+		 if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
+			mce_panic("Machine check from unknown source",
+				NULL, NULL);
+	}
 
 	/*
 	 * At insane "tolerant" levels we take no action. Otherwise
@@ -1637,10 +1663,16 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 		mce_intel_feature_init(c);
 		mce_adjust_timer = cmci_intel_adjust_timer;
 		break;
-	case X86_VENDOR_AMD:
+
+	case X86_VENDOR_AMD: {
+		u32 ebx = cpuid_ebx(0x80000007);
+
 		mce_amd_feature_init(c);
-		mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
+		mce_flags.overflow_recov = !!(ebx & BIT(0));
+		mce_flags.succor	 = !!(ebx & BIT(1));
 		break;
+		}
+
 	default:
 		break;
 	}
@@ -1884,7 +1916,7 @@ out:
 static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
 {
 	poll_wait(file, &mce_chrdev_wait, wait);
-	if (rcu_access_index(mcelog.next))
+	if (READ_ONCE(mcelog.next))
 		return POLLIN | POLLRDNORM;
 	if (!mce_apei_read_done && apei_check_mce())
 		return POLLIN | POLLRDNORM;
@@ -1929,8 +1961,8 @@ void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
 }
 EXPORT_SYMBOL_GPL(register_mce_write_callback);
 
-ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
-			 size_t usize, loff_t *off)
+static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
+				size_t usize, loff_t *off)
 {
 	if (mce_write)
 		return mce_write(filp, ubuf, usize, off);
@@ -1976,6 +2008,7 @@ void mce_disable_bank(int bank)
 /*
  * mce=off Disables machine check
  * mce=no_cmci Disables CMCI
+ * mce=no_lmce Disables LMCE
  * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
  * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
  * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
@@ -1999,6 +2032,8 @@ static int __init mcheck_enable(char *str)
 		cfg->disabled = true;
 	else if (!strcmp(str, "no_cmci"))
 		cfg->cmci_disabled = true;
+	else if (!strcmp(str, "no_lmce"))
+		cfg->lmce_disabled = true;
 	else if (!strcmp(str, "dont_log_ce"))
 		cfg->dont_log_ce = true;
 	else if (!strcmp(str, "ignore_ce"))
@@ -2008,11 +2043,8 @@ static int __init mcheck_enable(char *str)
 	else if (!strcmp(str, "bios_cmci_threshold"))
 		cfg->bios_cmci_threshold = true;
 	else if (isdigit(str[0])) {
-		get_option(&str, &(cfg->tolerant));
-		if (*str == ',') {
-			++str;
+		if (get_option(&str, &cfg->tolerant) == 2)
 			get_option(&str, &(cfg->monarch_timeout));
-		}
 	} else {
 		pr_info("mce argument %s ignored. Please use /sys\n", str);
 		return 0;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 55ad9b37cae8..e99b15077e94 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -1,19 +1,13 @@
 /*
- *  (c) 2005-2012 Advanced Micro Devices, Inc.
+ *  (c) 2005-2015 Advanced Micro Devices, Inc.
  *  Your use of this code is subject to the terms and conditions of the
  *  GNU general public license version 2. See "COPYING" or
  *  http://www.gnu.org/licenses/gpl.html
  *
  *  Written by Jacob Shin - AMD, Inc.
- *
  *  Maintained by: Borislav Petkov <bp@alien8.de>
  *
- *  April 2006
- *     - added support for AMD Family 0x10 processors
- *  May 2012
- *     - major scrubbing
- *
- *  All MC4_MISCi registers are shared between multi-cores
+ *  All MC4_MISCi registers are shared between cores on a node.
  */
 #include <linux/interrupt.h>
 #include <linux/notifier.h>
@@ -32,6 +26,7 @@
 #include <asm/idle.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
 
 #define NR_BLOCKS         9
 #define THRESHOLD_MAX     0xFFF
@@ -47,6 +42,13 @@
 #define MASK_BLKPTR_LO    0xFF000000
 #define MCG_XBLK_ADDR     0xC0000400
 
+/* Deferred error settings */
+#define MSR_CU_DEF_ERR		0xC0000410
+#define MASK_DEF_LVTOFF		0x000000F0
+#define MASK_DEF_INT_TYPE	0x00000006
+#define DEF_LVT_OFF		0x2
+#define DEF_INT_TYPE_APIC	0x2
+
 static const char * const th_names[] = {
 	"load_store",
 	"insn_fetch",
@@ -60,6 +62,13 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
 static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */
 
 static void amd_threshold_interrupt(void);
+static void amd_deferred_error_interrupt(void);
+
+static void default_deferred_error_interrupt(void)
+{
+	pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR);
+}
+void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
 
 /*
  * CPU Initialization
@@ -196,7 +205,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset)
 	threshold_restart_bank(&tr);
 };
 
-static int setup_APIC_mce(int reserved, int new)
+static int setup_APIC_mce_threshold(int reserved, int new)
 {
 	if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
 					      APIC_EILVT_MSG_FIX, 0))
@@ -205,6 +214,39 @@ static int setup_APIC_mce(int reserved, int new)
 	return reserved;
 }
 
+static int setup_APIC_deferred_error(int reserved, int new)
+{
+	if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR,
+					      APIC_EILVT_MSG_FIX, 0))
+		return new;
+
+	return reserved;
+}
+
+static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
+{
+	u32 low = 0, high = 0;
+	int def_offset = -1, def_new;
+
+	if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high))
+		return;
+
+	def_new = (low & MASK_DEF_LVTOFF) >> 4;
+	if (!(low & MASK_DEF_LVTOFF)) {
+		pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n");
+		def_new = DEF_LVT_OFF;
+		low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4);
+	}
+
+	def_offset = setup_APIC_deferred_error(def_offset, def_new);
+	if ((def_offset == def_new) &&
+	    (deferred_error_int_vector != amd_deferred_error_interrupt))
+		deferred_error_int_vector = amd_deferred_error_interrupt;
+
+	low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
+	wrmsr(MSR_CU_DEF_ERR, low, high);
+}
+
 /* cpu init entry point, called from mce.c with preempt off */
 void mce_amd_feature_init(struct cpuinfo_x86 *c)
 {
@@ -252,7 +294,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 
 			b.interrupt_enable = 1;
 			new	= (high & MASK_LVTOFF_HI) >> 20;
-			offset  = setup_APIC_mce(offset, new);
+			offset  = setup_APIC_mce_threshold(offset, new);
 
 			if ((offset == new) &&
 			    (mce_threshold_vector != amd_threshold_interrupt))
@@ -262,6 +304,73 @@ init:
 			mce_threshold_block_init(&b, offset);
 		}
 	}
+
+	if (mce_flags.succor)
+		deferred_error_interrupt_enable(c);
+}
+
+static void __log_error(unsigned int bank, bool threshold_err, u64 misc)
+{
+	struct mce m;
+	u64 status;
+
+	rdmsrl(MSR_IA32_MCx_STATUS(bank), status);
+	if (!(status & MCI_STATUS_VAL))
+		return;
+
+	mce_setup(&m);
+
+	m.status = status;
+	m.bank = bank;
+
+	if (threshold_err)
+		m.misc = misc;
+
+	if (m.status & MCI_STATUS_ADDRV)
+		rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr);
+
+	mce_log(&m);
+	wrmsrl(MSR_IA32_MCx_STATUS(bank), 0);
+}
+
+static inline void __smp_deferred_error_interrupt(void)
+{
+	inc_irq_stat(irq_deferred_error_count);
+	deferred_error_int_vector();
+}
+
+asmlinkage __visible void smp_deferred_error_interrupt(void)
+{
+	entering_irq();
+	__smp_deferred_error_interrupt();
+	exiting_ack_irq();
+}
+
+asmlinkage __visible void smp_trace_deferred_error_interrupt(void)
+{
+	entering_irq();
+	trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
+	__smp_deferred_error_interrupt();
+	trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
+	exiting_ack_irq();
+}
+
+/* APIC interrupt handler for deferred errors */
+static void amd_deferred_error_interrupt(void)
+{
+	u64 status;
+	unsigned int bank;
+
+	for (bank = 0; bank < mca_cfg.banks; ++bank) {
+		rdmsrl(MSR_IA32_MCx_STATUS(bank), status);
+
+		if (!(status & MCI_STATUS_VAL) ||
+		    !(status & MCI_STATUS_DEFERRED))
+			continue;
+
+		__log_error(bank, false, 0);
+		break;
+	}
 }
 
 /*
@@ -273,12 +382,12 @@ init:
  * the interrupt goes off when error_count reaches threshold_limit.
  * the handler will simply log mcelog w/ software defined bank number.
  */
+
 static void amd_threshold_interrupt(void)
 {
 	u32 low = 0, high = 0, address = 0;
 	int cpu = smp_processor_id();
 	unsigned int bank, block;
-	struct mce m;
 
 	/* assume first bank caused it */
 	for (bank = 0; bank < mca_cfg.banks; ++bank) {
@@ -321,15 +430,7 @@ static void amd_threshold_interrupt(void)
 	return;
 
 log:
-	mce_setup(&m);
-	rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status);
-	if (!(m.status & MCI_STATUS_VAL))
-		return;
-	m.misc = ((u64)high << 32) | low;
-	m.bank = bank;
-	mce_log(&m);
-
-	wrmsrl(MSR_IA32_MCx_STATUS(bank), 0);
+	__log_error(bank, true, ((u64)high << 32) | low);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index b4a41cf030ed..844f56c5616d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -91,6 +91,36 @@ static int cmci_supported(int *banks)
 	return !!(cap & MCG_CMCI_P);
 }
 
+static bool lmce_supported(void)
+{
+	u64 tmp;
+
+	if (mca_cfg.lmce_disabled)
+		return false;
+
+	rdmsrl(MSR_IA32_MCG_CAP, tmp);
+
+	/*
+	 * LMCE depends on recovery support in the processor. Hence both
+	 * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP.
+	 */
+	if ((tmp & (MCG_SER_P | MCG_LMCE_P)) !=
+		   (MCG_SER_P | MCG_LMCE_P))
+		return false;
+
+	/*
+	 * BIOS should indicate support for LMCE by setting bit 20 in
+	 * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will
+	 * generate a #GP fault.
+	 */
+	rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp);
+	if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) ==
+		   (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE))
+		return true;
+
+	return false;
+}
+
 bool mce_intel_cmci_poll(void)
 {
 	if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
@@ -405,8 +435,22 @@ static void intel_init_cmci(void)
 	cmci_recheck();
 }
 
+void intel_init_lmce(void)
+{
+	u64 val;
+
+	if (!lmce_supported())
+		return;
+
+	rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+
+	if (!(val & MCG_EXT_CTL_LMCE_EN))
+		wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
+}
+
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
 	intel_init_thermal(c);
 	intel_init_cmci();
+	intel_init_lmce();
 }
diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c
index 737737edbd1e..e8a215a9a345 100644
--- a/arch/x86/kernel/cpu/microcode/amd_early.c
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -228,7 +228,23 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
 	}
 }
 
-void __init load_ucode_amd_bsp(void)
+static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
+					      unsigned int family)
+{
+#ifdef CONFIG_X86_64
+	char fw_name[36] = "amd-ucode/microcode_amd.bin";
+
+	if (family >= 0x15)
+		snprintf(fw_name, sizeof(fw_name),
+			 "amd-ucode/microcode_amd_fam%.2xh.bin", family);
+
+	return get_builtin_firmware(cp, fw_name);
+#else
+	return false;
+#endif
+}
+
+void __init load_ucode_amd_bsp(unsigned int family)
 {
 	struct cpio_data cp;
 	void **data;
@@ -243,8 +259,10 @@ void __init load_ucode_amd_bsp(void)
 #endif
 
 	cp = find_ucode_in_initrd();
-	if (!cp.data)
-		return;
+	if (!cp.data) {
+		if (!load_builtin_amd_microcode(&cp, family))
+			return;
+	}
 
 	*data = cp.data;
 	*size = cp.size;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 36a83617eb21..6236a54a63f4 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -1,74 +1,16 @@
 /*
- *	Intel CPU Microcode Update Driver for Linux
+ * CPU Microcode Update Driver for Linux
  *
- *	Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *		      2006	Shaohua Li <shaohua.li@intel.com>
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *	      2006	Shaohua Li <shaohua.li@intel.com>
+ *	      2013-2015	Borislav Petkov <bp@alien8.de>
  *
- *	This driver allows to upgrade microcode on Intel processors
- *	belonging to IA-32 family - PentiumPro, Pentium II,
- *	Pentium III, Xeon, Pentium 4, etc.
+ * This driver allows to upgrade microcode on x86 processors.
  *
- *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- *	Software Developer's Manual
- *	Order Number 253668 or free download from:
- *
- *	http://developer.intel.com/Assets/PDF/manual/253668.pdf	
- *
- *	For more information, go to http://www.urbanmyth.org/microcode
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- *	1.0	16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Initial release.
- *	1.01	18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Added read() support + cleanups.
- *	1.02	21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Added 'device trimming' support. open(O_WRONLY) zeroes
- *		and frees the saved copy of applied microcode.
- *	1.03	29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Made to use devfs (/dev/cpu/microcode) + cleanups.
- *	1.04	06 Jun 2000, Simon Trimmer <simon@veritas.com>
- *		Added misc device support (now uses both devfs and misc).
- *		Added MICROCODE_IOCFREE ioctl to clear memory.
- *	1.05	09 Jun 2000, Simon Trimmer <simon@veritas.com>
- *		Messages for error cases (non Intel & no suitable microcode).
- *	1.06	03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- *		Removed ->release(). Removed exclusive open and status bitmap.
- *		Added microcode_rwsem to serialize read()/write()/ioctl().
- *		Removed global kernel lock usage.
- *	1.07	07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- *		Write 0 to 0x8B msr and then cpuid before reading revision,
- *		so that it works even if there were no update done by the
- *		BIOS. Otherwise, reading from 0x8B gives junk (which happened
- *		to be 0 on my machine which is why it worked even when I
- *		disabled update by the BIOS)
- *		Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- *	1.08	11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- *			     Tigran Aivazian <tigran@veritas.com>
- *		Intel Pentium 4 processor support and bugfixes.
- *	1.09	30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- *		Bugfix for HT (Hyper-Threading) enabled processors
- *		whereby processor resources are shared by all logical processors
- *		in a single CPU package.
- *	1.10	28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- *		Tigran Aivazian <tigran@veritas.com>,
- *		Serialize updates as required on HT processors due to
- *		speculative nature of implementation.
- *	1.11	22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- *		Fix the panic when writing zero-length microcode chunk.
- *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- *		Jun Nakajima <jun.nakajima@intel.com>
- *		Support for the microcode updates in the new format.
- *	1.13	10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- *		Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- *		because we no longer hold a copy of applied microcode
- *		in kernel memory.
- *	1.14	25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- *		Fix sigmatch() macro to handle old CPUs with pf == 0.
- *		Thanks to Stuart Swales for pointing out this bug.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index a413a69cbd74..8ebc421d6299 100644
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -3,6 +3,7 @@
  *
  *	Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
  *			   H Peter Anvin" <hpa@zytor.com>
+ *		  (C) 2015 Borislav Petkov <bp@alien8.de>
  *
  *	This driver allows to early upgrade microcode on Intel processors
  *	belonging to IA-32 family - PentiumPro, Pentium II,
@@ -17,6 +18,7 @@
  *	2 of the License, or (at your option) any later version.
  */
 #include <linux/module.h>
+#include <linux/firmware.h>
 #include <asm/microcode.h>
 #include <asm/microcode_intel.h>
 #include <asm/microcode_amd.h>
@@ -43,9 +45,29 @@ static bool __init check_loader_disabled_bsp(void)
 	return *res;
 }
 
+extern struct builtin_fw __start_builtin_fw[];
+extern struct builtin_fw __end_builtin_fw[];
+
+bool get_builtin_firmware(struct cpio_data *cd, const char *name)
+{
+#ifdef CONFIG_FW_LOADER
+	struct builtin_fw *b_fw;
+
+	for (b_fw = __start_builtin_fw; b_fw != __end_builtin_fw; b_fw++) {
+		if (!strcmp(name, b_fw->name)) {
+			cd->size = b_fw->size;
+			cd->data = b_fw->data;
+			return true;
+		}
+	}
+#endif
+	return false;
+}
+
 void __init load_ucode_bsp(void)
 {
-	int vendor, family;
+	int vendor;
+	unsigned int family;
 
 	if (check_loader_disabled_bsp())
 		return;
@@ -63,7 +85,7 @@ void __init load_ucode_bsp(void)
 		break;
 	case X86_VENDOR_AMD:
 		if (family >= 0x10)
-			load_ucode_amd_bsp();
+			load_ucode_amd_bsp(family);
 		break;
 	default:
 		break;
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index a41beadb3db9..969dc17eb1b4 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -1,74 +1,13 @@
 /*
- *	Intel CPU Microcode Update Driver for Linux
+ * Intel CPU Microcode Update Driver for Linux
  *
- *	Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *		      2006	Shaohua Li <shaohua.li@intel.com>
+ * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *		 2006 Shaohua Li <shaohua.li@intel.com>
  *
- *	This driver allows to upgrade microcode on Intel processors
- *	belonging to IA-32 family - PentiumPro, Pentium II,
- *	Pentium III, Xeon, Pentium 4, etc.
- *
- *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- *	Software Developer's Manual
- *	Order Number 253668 or free download from:
- *
- *	http://developer.intel.com/Assets/PDF/manual/253668.pdf	
- *
- *	For more information, go to http://www.urbanmyth.org/microcode
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- *	1.0	16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Initial release.
- *	1.01	18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Added read() support + cleanups.
- *	1.02	21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Added 'device trimming' support. open(O_WRONLY) zeroes
- *		and frees the saved copy of applied microcode.
- *	1.03	29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Made to use devfs (/dev/cpu/microcode) + cleanups.
- *	1.04	06 Jun 2000, Simon Trimmer <simon@veritas.com>
- *		Added misc device support (now uses both devfs and misc).
- *		Added MICROCODE_IOCFREE ioctl to clear memory.
- *	1.05	09 Jun 2000, Simon Trimmer <simon@veritas.com>
- *		Messages for error cases (non Intel & no suitable microcode).
- *	1.06	03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- *		Removed ->release(). Removed exclusive open and status bitmap.
- *		Added microcode_rwsem to serialize read()/write()/ioctl().
- *		Removed global kernel lock usage.
- *	1.07	07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- *		Write 0 to 0x8B msr and then cpuid before reading revision,
- *		so that it works even if there were no update done by the
- *		BIOS. Otherwise, reading from 0x8B gives junk (which happened
- *		to be 0 on my machine which is why it worked even when I
- *		disabled update by the BIOS)
- *		Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- *	1.08	11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- *			     Tigran Aivazian <tigran@veritas.com>
- *		Intel Pentium 4 processor support and bugfixes.
- *	1.09	30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- *		Bugfix for HT (Hyper-Threading) enabled processors
- *		whereby processor resources are shared by all logical processors
- *		in a single CPU package.
- *	1.10	28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- *		Tigran Aivazian <tigran@veritas.com>,
- *		Serialize updates as required on HT processors due to
- *		speculative nature of implementation.
- *	1.11	22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- *		Fix the panic when writing zero-length microcode chunk.
- *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- *		Jun Nakajima <jun.nakajima@intel.com>
- *		Support for the microcode updates in the new format.
- *	1.13	10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- *		Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- *		because we no longer hold a copy of applied microcode
- *		in kernel memory.
- *	1.14	25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- *		Fix sigmatch() macro to handle old CPUs with pf == 0.
- *		Thanks to Stuart Swales for pointing out this bug.
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -124,7 +63,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
 	cpf = cpu_sig.pf;
 	crev = cpu_sig.rev;
 
-	return get_matching_microcode(csig, cpf, crev, mc_intel);
+	return has_newer_microcode(mc_intel, csig, cpf, crev);
 }
 
 static int apply_microcode_intel(int cpu)
@@ -226,7 +165,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 
 		csig = uci->cpu_sig.sig;
 		cpf = uci->cpu_sig.pf;
-		if (get_matching_microcode(csig, cpf, new_rev, mc)) {
+		if (has_newer_microcode(mc, csig, cpf, new_rev)) {
 			vfree(new_mc);
 			new_rev = mc_header.rev;
 			new_mc  = mc;
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 2f49ab4ac0ae..8187b7247d1c 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -59,10 +59,10 @@ load_microcode_early(struct microcode_intel **saved,
 		ucode_ptr = saved[i];
 		mc_hdr	  = (struct microcode_header_intel *)ucode_ptr;
 
-		ret = get_matching_microcode(uci->cpu_sig.sig,
-					     uci->cpu_sig.pf,
-					     new_rev,
-					     ucode_ptr);
+		ret = has_newer_microcode(ucode_ptr,
+					  uci->cpu_sig.sig,
+					  uci->cpu_sig.pf,
+					  new_rev);
 		if (!ret)
 			continue;
 
@@ -246,7 +246,7 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved,
 			     u8 *ucode_ptr, unsigned int num_saved)
 {
 	struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
-	unsigned int sig, pf, new_rev;
+	unsigned int sig, pf;
 	int found = 0, i;
 
 	mc_hdr = (struct microcode_header_intel *)ucode_ptr;
@@ -255,14 +255,13 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved,
 		mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
 		sig	     = mc_saved_hdr->sig;
 		pf	     = mc_saved_hdr->pf;
-		new_rev	     = mc_hdr->rev;
 
-		if (!get_matching_sig(sig, pf, new_rev, ucode_ptr))
+		if (!find_matching_signature(ucode_ptr, sig, pf))
 			continue;
 
 		found = 1;
 
-		if (!revision_is_newer(mc_hdr, new_rev))
+		if (mc_hdr->rev <= mc_saved_hdr->rev)
 			continue;
 
 		/*
@@ -522,6 +521,27 @@ out:
 EXPORT_SYMBOL_GPL(save_mc_for_early);
 #endif
 
+static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
+{
+#ifdef CONFIG_X86_64
+	unsigned int eax = 0x00000001, ebx, ecx = 0, edx;
+	unsigned int family, model, stepping;
+	char name[30];
+
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+
+	family   = __x86_family(eax);
+	model    = x86_model(eax);
+	stepping = eax & 0xf;
+
+	sprintf(name, "intel-ucode/%02x-%02x-%02x", family, model, stepping);
+
+	return get_builtin_firmware(cp, name);
+#else
+	return false;
+#endif
+}
+
 static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
 static __init enum ucode_state
 scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
@@ -540,8 +560,10 @@ scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
 	cd.size = 0;
 
 	cd = find_cpio_data(p, (void *)start, size, &offset);
-	if (!cd.data)
-		return UCODE_ERROR;
+	if (!cd.data) {
+		if (!load_builtin_intel_microcode(&cd))
+			return UCODE_ERROR;
+	}
 
 	return get_matching_model_microcode(0, start, cd.data, cd.size,
 					    mc_saved_data, initrd, uci);
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index cd47a510a3f1..1883d252ff7d 100644
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -31,11 +31,18 @@
 #include <asm/processor.h>
 #include <asm/msr.h>
 
-static inline int
-update_match_cpu(unsigned int csig, unsigned int cpf,
-		 unsigned int sig, unsigned int pf)
+static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
+					unsigned int s2, unsigned int p2)
 {
-	return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
+	if (s1 != s2)
+		return false;
+
+	/* Processor flags are either both 0 ... */
+	if (!p1 && !p2)
+		return true;
+
+	/* ... or they intersect. */
+	return p1 & p2;
 }
 
 int microcode_sanity_check(void *mc, int print_err)
@@ -124,27 +131,25 @@ EXPORT_SYMBOL_GPL(microcode_sanity_check);
 /*
  * Returns 1 if update has been found, 0 otherwise.
  */
-int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
+int find_matching_signature(void *mc, unsigned int csig, int cpf)
 {
-	struct microcode_header_intel *mc_header = mc;
-	struct extended_sigtable *ext_header;
-	unsigned long total_size = get_totalsize(mc_header);
-	int ext_sigcount, i;
+	struct microcode_header_intel *mc_hdr = mc;
+	struct extended_sigtable *ext_hdr;
 	struct extended_signature *ext_sig;
+	int i;
 
-	if (update_match_cpu(csig, cpf, mc_header->sig, mc_header->pf))
+	if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
 		return 1;
 
 	/* Look for ext. headers: */
-	if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
+	if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
 		return 0;
 
-	ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
-	ext_sigcount = ext_header->count;
-	ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+	ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
+	ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
 
-	for (i = 0; i < ext_sigcount; i++) {
-		if (update_match_cpu(csig, cpf, ext_sig->sig, ext_sig->pf))
+	for (i = 0; i < ext_hdr->count; i++) {
+		if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
 			return 1;
 		ext_sig++;
 	}
@@ -154,13 +159,13 @@ int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
 /*
  * Returns 1 if update has been found, 0 otherwise.
  */
-int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc)
+int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
 {
 	struct microcode_header_intel *mc_hdr = mc;
 
-	if (!revision_is_newer(mc_hdr, rev))
+	if (mc_hdr->rev <= new_rev)
 		return 0;
 
-	return get_matching_sig(csig, cpf, rev, mc);
+	return find_matching_signature(mc, csig, cpf);
 }
-EXPORT_SYMBOL_GPL(get_matching_microcode);
+EXPORT_SYMBOL_GPL(has_newer_microcode);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 939155ffdece..aad4bd84b475 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -39,14 +39,12 @@ void hyperv_vector_handler(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
-	irq_enter();
-	exit_idle();
-
+	entering_irq();
 	inc_irq_stat(irq_hv_callback_count);
 	if (vmbus_handler)
 		vmbus_handler();
 
-	irq_exit();
+	exiting_irq();
 	set_irq_regs(old_regs);
 }
 
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 5f90b85ff22e..70d7c93f4550 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -98,7 +98,8 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
 			continue;
 		base = range_state[i].base_pfn;
 		if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
-		    (mtrr_state.enabled & 1)) {
+		    (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+		    (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
 			/* Var MTRR contains UC entry below 1M? Skip it: */
 			printk(BIOS_BUG_MSG, i);
 			if (base + size <= (1<<(20-PAGE_SHIFT)))
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 7d74f7b3c6ba..3b533cf37c74 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -102,59 +102,76 @@ static int check_type_overlap(u8 *prev, u8 *curr)
 	return 0;
 }
 
-/*
- * Error/Semi-error returns:
- * 0xFF - when MTRR is not enabled
- * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
- *		corresponds only to [start:*partial_end].
- *		Caller has to lookup again for [*partial_end:end].
+/**
+ * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries
+ *
+ * Return the MTRR fixed memory type of 'start'.
+ *
+ * MTRR fixed entries are divided into the following ways:
+ *  0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges
+ *  0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges
+ *  0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges
+ *
+ * Return Values:
+ * MTRR_TYPE_(type)  - Matched memory type
+ * MTRR_TYPE_INVALID - Unmatched
+ */
+static u8 mtrr_type_lookup_fixed(u64 start, u64 end)
+{
+	int idx;
+
+	if (start >= 0x100000)
+		return MTRR_TYPE_INVALID;
+
+	/* 0x0 - 0x7FFFF */
+	if (start < 0x80000) {
+		idx = 0;
+		idx += (start >> 16);
+		return mtrr_state.fixed_ranges[idx];
+	/* 0x80000 - 0xBFFFF */
+	} else if (start < 0xC0000) {
+		idx = 1 * 8;
+		idx += ((start - 0x80000) >> 14);
+		return mtrr_state.fixed_ranges[idx];
+	}
+
+	/* 0xC0000 - 0xFFFFF */
+	idx = 3 * 8;
+	idx += ((start - 0xC0000) >> 12);
+	return mtrr_state.fixed_ranges[idx];
+}
+
+/**
+ * mtrr_type_lookup_variable - look up memory type in MTRR variable entries
+ *
+ * Return Value:
+ * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched)
+ *
+ * Output Arguments:
+ * repeat - Set to 1 when [start:end] spanned across MTRR range and type
+ *	    returned corresponds only to [start:*partial_end].  Caller has
+ *	    to lookup again for [*partial_end:end].
+ *
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ *	     region is fully covered by a single MTRR entry or the default
+ *	     type.
  */
-static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
+static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end,
+				    int *repeat, u8 *uniform)
 {
 	int i;
 	u64 base, mask;
 	u8 prev_match, curr_match;
 
 	*repeat = 0;
-	if (!mtrr_state_set)
-		return 0xFF;
-
-	if (!mtrr_state.enabled)
-		return 0xFF;
+	*uniform = 1;
 
-	/* Make end inclusive end, instead of exclusive */
+	/* Make end inclusive instead of exclusive */
 	end--;
 
-	/* Look in fixed ranges. Just return the type as per start */
-	if (mtrr_state.have_fixed && (start < 0x100000)) {
-		int idx;
-
-		if (start < 0x80000) {
-			idx = 0;
-			idx += (start >> 16);
-			return mtrr_state.fixed_ranges[idx];
-		} else if (start < 0xC0000) {
-			idx = 1 * 8;
-			idx += ((start - 0x80000) >> 14);
-			return mtrr_state.fixed_ranges[idx];
-		} else if (start < 0x1000000) {
-			idx = 3 * 8;
-			idx += ((start - 0xC0000) >> 12);
-			return mtrr_state.fixed_ranges[idx];
-		}
-	}
-
-	/*
-	 * Look in variable ranges
-	 * Look of multiple ranges matching this address and pick type
-	 * as per MTRR precedence
-	 */
-	if (!(mtrr_state.enabled & 2))
-		return mtrr_state.def_type;
-
-	prev_match = 0xFF;
+	prev_match = MTRR_TYPE_INVALID;
 	for (i = 0; i < num_var_ranges; ++i) {
-		unsigned short start_state, end_state;
+		unsigned short start_state, end_state, inclusive;
 
 		if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11)))
 			continue;
@@ -166,20 +183,29 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
 
 		start_state = ((start & mask) == (base & mask));
 		end_state = ((end & mask) == (base & mask));
+		inclusive = ((start < base) && (end > base));
 
-		if (start_state != end_state) {
+		if ((start_state != end_state) || inclusive) {
 			/*
 			 * We have start:end spanning across an MTRR.
-			 * We split the region into
-			 * either
-			 * (start:mtrr_end) (mtrr_end:end)
-			 * or
-			 * (start:mtrr_start) (mtrr_start:end)
+			 * We split the region into either
+			 *
+			 * - start_state:1
+			 * (start:mtrr_end)(mtrr_end:end)
+			 * - end_state:1
+			 * (start:mtrr_start)(mtrr_start:end)
+			 * - inclusive:1
+			 * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end)
+			 *
 			 * depending on kind of overlap.
-			 * Return the type for first region and a pointer to
-			 * the start of second region so that caller will
-			 * lookup again on the second region.
-			 * Note: This way we handle multiple overlaps as well.
+			 *
+			 * Return the type of the first region and a pointer
+			 * to the start of next region so that caller will be
+			 * advised to lookup again after having adjusted start
+			 * and end.
+			 *
+			 * Note: This way we handle overlaps with multiple
+			 * entries and the default type properly.
 			 */
 			if (start_state)
 				*partial_end = base + get_mtrr_size(mask);
@@ -193,59 +219,94 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
 
 			end = *partial_end - 1; /* end is inclusive */
 			*repeat = 1;
+			*uniform = 0;
 		}
 
 		if ((start & mask) != (base & mask))
 			continue;
 
 		curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
-		if (prev_match == 0xFF) {
+		if (prev_match == MTRR_TYPE_INVALID) {
 			prev_match = curr_match;
 			continue;
 		}
 
+		*uniform = 0;
 		if (check_type_overlap(&prev_match, &curr_match))
 			return curr_match;
 	}
 
-	if (mtrr_tom2) {
-		if (start >= (1ULL<<32) && (end < mtrr_tom2))
-			return MTRR_TYPE_WRBACK;
-	}
-
-	if (prev_match != 0xFF)
+	if (prev_match != MTRR_TYPE_INVALID)
 		return prev_match;
 
 	return mtrr_state.def_type;
 }
 
-/*
- * Returns the effective MTRR type for the region
- * Error return:
- * 0xFF - when MTRR is not enabled
+/**
+ * mtrr_type_lookup - look up memory type in MTRR
+ *
+ * Return Values:
+ * MTRR_TYPE_(type)  - The effective MTRR type for the region
+ * MTRR_TYPE_INVALID - MTRR is disabled
+ *
+ * Output Argument:
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ *	     region is fully covered by a single MTRR entry or the default
+ *	     type.
  */
-u8 mtrr_type_lookup(u64 start, u64 end)
+u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
 {
-	u8 type, prev_type;
+	u8 type, prev_type, is_uniform = 1, dummy;
 	int repeat;
 	u64 partial_end;
 
-	type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+	if (!mtrr_state_set)
+		return MTRR_TYPE_INVALID;
+
+	if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED))
+		return MTRR_TYPE_INVALID;
+
+	/*
+	 * Look up the fixed ranges first, which take priority over
+	 * the variable ranges.
+	 */
+	if ((start < 0x100000) &&
+	    (mtrr_state.have_fixed) &&
+	    (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
+		is_uniform = 0;
+		type = mtrr_type_lookup_fixed(start, end);
+		goto out;
+	}
+
+	/*
+	 * Look up the variable ranges.  Look of multiple ranges matching
+	 * this address and pick type as per MTRR precedence.
+	 */
+	type = mtrr_type_lookup_variable(start, end, &partial_end,
+					 &repeat, &is_uniform);
 
 	/*
 	 * Common path is with repeat = 0.
 	 * However, we can have cases where [start:end] spans across some
-	 * MTRR range. Do repeated lookups for that case here.
+	 * MTRR ranges and/or the default type.  Do repeated lookups for
+	 * that case here.
 	 */
 	while (repeat) {
 		prev_type = type;
 		start = partial_end;
-		type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+		is_uniform = 0;
+		type = mtrr_type_lookup_variable(start, end, &partial_end,
+						 &repeat, &dummy);
 
 		if (check_type_overlap(&prev_type, &type))
-			return type;
+			goto out;
 	}
 
+	if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2))
+		type = MTRR_TYPE_WRBACK;
+
+out:
+	*uniform = is_uniform;
 	return type;
 }
 
@@ -347,7 +408,9 @@ static void __init print_mtrr_state(void)
 		 mtrr_attrib_to_str(mtrr_state.def_type));
 	if (mtrr_state.have_fixed) {
 		pr_debug("MTRR fixed ranges %sabled:\n",
-			 mtrr_state.enabled & 1 ? "en" : "dis");
+			((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+			 (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ?
+			 "en" : "dis");
 		print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
 		for (i = 0; i < 2; ++i)
 			print_fixed(0x80000 + i * 0x20000, 0x04000,
@@ -360,7 +423,7 @@ static void __init print_mtrr_state(void)
 		print_fixed_last();
 	}
 	pr_debug("MTRR variable ranges %sabled:\n",
-		 mtrr_state.enabled & 2 ? "en" : "dis");
+		 mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis");
 	high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
 
 	for (i = 0; i < num_var_ranges; ++i) {
@@ -382,7 +445,7 @@ static void __init print_mtrr_state(void)
 }
 
 /* Grab all of the MTRR state for this CPU into *state */
-void __init get_mtrr_state(void)
+bool __init get_mtrr_state(void)
 {
 	struct mtrr_var_range *vrs;
 	unsigned long flags;
@@ -426,6 +489,8 @@ void __init get_mtrr_state(void)
 
 	post_set();
 	local_irq_restore(flags);
+
+	return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED);
 }
 
 /* Some BIOS's are messed up and don't set all MTRRs the same! */
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index ea5f363a1948..e7ed0d8ebacb 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -59,6 +59,12 @@
 #define MTRR_TO_PHYS_WC_OFFSET 1000
 
 u32 num_var_ranges;
+static bool __mtrr_enabled;
+
+static bool mtrr_enabled(void)
+{
+	return __mtrr_enabled;
+}
 
 unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
 static DEFINE_MUTEX(mtrr_mutex);
@@ -286,7 +292,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 	int i, replace, error;
 	mtrr_type ltype;
 
-	if (!mtrr_if)
+	if (!mtrr_enabled())
 		return -ENXIO;
 
 	error = mtrr_if->validate_add_page(base, size, type);
@@ -435,6 +441,8 @@ static int mtrr_check(unsigned long base, unsigned long size)
 int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
 	     bool increment)
 {
+	if (!mtrr_enabled())
+		return -ENODEV;
 	if (mtrr_check(base, size))
 		return -EINVAL;
 	return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
@@ -463,8 +471,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
 	unsigned long lbase, lsize;
 	int error = -EINVAL;
 
-	if (!mtrr_if)
-		return -ENXIO;
+	if (!mtrr_enabled())
+		return -ENODEV;
 
 	max = num_var_ranges;
 	/* No CPU hotplug when we change MTRR entries */
@@ -523,6 +531,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
  */
 int mtrr_del(int reg, unsigned long base, unsigned long size)
 {
+	if (!mtrr_enabled())
+		return -ENODEV;
 	if (mtrr_check(base, size))
 		return -EINVAL;
 	return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
@@ -538,6 +548,9 @@ EXPORT_SYMBOL(mtrr_del);
  * attempts to add a WC MTRR covering size bytes starting at base and
  * logs an error if this fails.
  *
+ * The called should provide a power of two size on an equivalent
+ * power of two boundary.
+ *
  * Drivers must store the return value to pass to mtrr_del_wc_if_needed,
  * but drivers should not try to interpret that return value.
  */
@@ -545,7 +558,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size)
 {
 	int ret;
 
-	if (pat_enabled)
+	if (pat_enabled() || !mtrr_enabled())
 		return 0;  /* Success!  (We don't need to do anything.) */
 
 	ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
@@ -577,7 +590,7 @@ void arch_phys_wc_del(int handle)
 EXPORT_SYMBOL(arch_phys_wc_del);
 
 /*
- * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value
+ * arch_phys_wc_index - translates arch_phys_wc_add's return value
  * @handle: Return value from arch_phys_wc_add
  *
  * This will turn the return value from arch_phys_wc_add into an mtrr
@@ -587,14 +600,14 @@ EXPORT_SYMBOL(arch_phys_wc_del);
  * in printk line.  Alas there is an illegitimate use in some ancient
  * drm ioctls.
  */
-int phys_wc_to_mtrr_index(int handle)
+int arch_phys_wc_index(int handle)
 {
 	if (handle < MTRR_TO_PHYS_WC_OFFSET)
 		return -1;
 	else
 		return handle - MTRR_TO_PHYS_WC_OFFSET;
 }
-EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index);
+EXPORT_SYMBOL_GPL(arch_phys_wc_index);
 
 /*
  * HACK ALERT!
@@ -734,10 +747,12 @@ void __init mtrr_bp_init(void)
 	}
 
 	if (mtrr_if) {
+		__mtrr_enabled = true;
 		set_num_var_ranges();
 		init_table();
 		if (use_intel()) {
-			get_mtrr_state();
+			/* BIOS may override */
+			__mtrr_enabled = get_mtrr_state();
 
 			if (mtrr_cleanup(phys_addr)) {
 				changed_by_mtrr_cleanup = 1;
@@ -745,10 +760,16 @@ void __init mtrr_bp_init(void)
 			}
 		}
 	}
+
+	if (!mtrr_enabled())
+		pr_info("MTRR: Disabled\n");
 }
 
 void mtrr_ap_init(void)
 {
+	if (!mtrr_enabled())
+		return;
+
 	if (!use_intel() || mtrr_aps_delayed_init)
 		return;
 	/*
@@ -774,6 +795,9 @@ void mtrr_save_state(void)
 {
 	int first_cpu;
 
+	if (!mtrr_enabled())
+		return;
+
 	get_online_cpus();
 	first_cpu = cpumask_first(cpu_online_mask);
 	smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
@@ -782,6 +806,8 @@ void mtrr_save_state(void)
 
 void set_mtrr_aps_delayed_init(void)
 {
+	if (!mtrr_enabled())
+		return;
 	if (!use_intel())
 		return;
 
@@ -793,7 +819,7 @@ void set_mtrr_aps_delayed_init(void)
  */
 void mtrr_aps_init(void)
 {
-	if (!use_intel())
+	if (!use_intel() || !mtrr_enabled())
 		return;
 
 	/*
@@ -810,7 +836,7 @@ void mtrr_aps_init(void)
 
 void mtrr_bp_restore(void)
 {
-	if (!use_intel())
+	if (!use_intel() || !mtrr_enabled())
 		return;
 
 	mtrr_if->set_all();
@@ -818,7 +844,7 @@ void mtrr_bp_restore(void)
 
 static int __init mtrr_init_finialize(void)
 {
-	if (!mtrr_if)
+	if (!mtrr_enabled())
 		return 0;
 
 	if (use_intel()) {
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index df5e41f31a27..951884dcc433 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -51,7 +51,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
 
 void fill_mtrr_var_range(unsigned int index,
 		u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
-void get_mtrr_state(void);
+bool get_mtrr_state(void);
 
 extern void set_mtrr_ops(const struct mtrr_ops *ops);
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 87848ebe2bb7..5801a14f7524 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -135,6 +135,7 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 }
 
 static atomic_t active_events;
+static atomic_t pmc_refcount;
 static DEFINE_MUTEX(pmc_reserve_mutex);
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -190,6 +191,7 @@ static bool check_hw_exists(void)
 	u64 val, val_fail, val_new= ~0;
 	int i, reg, reg_fail, ret = 0;
 	int bios_fail = 0;
+	int reg_safe = -1;
 
 	/*
 	 * Check to see if the BIOS enabled any of the counters, if so
@@ -204,6 +206,8 @@ static bool check_hw_exists(void)
 			bios_fail = 1;
 			val_fail = val;
 			reg_fail = reg;
+		} else {
+			reg_safe = i;
 		}
 	}
 
@@ -222,11 +226,22 @@ static bool check_hw_exists(void)
 	}
 
 	/*
+	 * If all the counters are enabled, the below test will always
+	 * fail.  The tools will also become useless in this scenario.
+	 * Just fail and disable the hardware counters.
+	 */
+
+	if (reg_safe == -1) {
+		reg = reg_safe;
+		goto msr_fail;
+	}
+
+	/*
 	 * Read the current value, change it and read it back to see if it
 	 * matches, this is needed to detect certain hardware emulators
 	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
 	 */
-	reg = x86_pmu_event_addr(0);
+	reg = x86_pmu_event_addr(reg_safe);
 	if (rdmsrl_safe(reg, &val))
 		goto msr_fail;
 	val ^= 0xffffUL;
@@ -256,11 +271,8 @@ msr_fail:
 
 static void hw_perf_event_destroy(struct perf_event *event)
 {
-	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
-		release_pmc_hardware();
-		release_ds_buffers();
-		mutex_unlock(&pmc_reserve_mutex);
-	}
+	x86_release_hardware();
+	atomic_dec(&active_events);
 }
 
 void hw_perf_lbr_event_destroy(struct perf_event *event)
@@ -310,6 +322,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
 	return x86_pmu_extra_regs(val, event);
 }
 
+int x86_reserve_hardware(void)
+{
+	int err = 0;
+
+	if (!atomic_inc_not_zero(&pmc_refcount)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_read(&pmc_refcount) == 0) {
+			if (!reserve_pmc_hardware())
+				err = -EBUSY;
+			else
+				reserve_ds_buffers();
+		}
+		if (!err)
+			atomic_inc(&pmc_refcount);
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+
+	return err;
+}
+
+void x86_release_hardware(void)
+{
+	if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
+		release_pmc_hardware();
+		release_ds_buffers();
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+}
+
 /*
  * Check if we can create event of a certain type (that no conflicting events
  * are present).
@@ -322,21 +363,34 @@ int x86_add_exclusive(unsigned int what)
 		return 0;
 
 	mutex_lock(&pmc_reserve_mutex);
-	for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++)
+	for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
 		if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
 			goto out;
+	}
 
 	atomic_inc(&x86_pmu.lbr_exclusive[what]);
 	ret = 0;
 
 out:
 	mutex_unlock(&pmc_reserve_mutex);
+
+	/*
+	 * Assuming that all exclusive events will share the PMI handler
+	 * (which checks active_events for whether there is work to do),
+	 * we can bump active_events counter right here, except for
+	 * x86_lbr_exclusive_lbr events that go through x86_pmu_event_init()
+	 * path, which already bumps active_events for them.
+	 */
+	if (!ret && what != x86_lbr_exclusive_lbr)
+		atomic_inc(&active_events);
+
 	return ret;
 }
 
 void x86_del_exclusive(unsigned int what)
 {
 	atomic_dec(&x86_pmu.lbr_exclusive[what]);
+	atomic_dec(&active_events);
 }
 
 int x86_setup_perfctr(struct perf_event *event)
@@ -513,22 +567,11 @@ static int __x86_pmu_event_init(struct perf_event *event)
 	if (!x86_pmu_initialized())
 		return -ENODEV;
 
-	err = 0;
-	if (!atomic_inc_not_zero(&active_events)) {
-		mutex_lock(&pmc_reserve_mutex);
-		if (atomic_read(&active_events) == 0) {
-			if (!reserve_pmc_hardware())
-				err = -EBUSY;
-			else
-				reserve_ds_buffers();
-		}
-		if (!err)
-			atomic_inc(&active_events);
-		mutex_unlock(&pmc_reserve_mutex);
-	}
+	err = x86_reserve_hardware();
 	if (err)
 		return err;
 
+	atomic_inc(&active_events);
 	event->destroy = hw_perf_event_destroy;
 
 	event->hw.idx = -1;
@@ -611,6 +654,7 @@ struct sched_state {
 	int	event;		/* event index */
 	int	counter;	/* counter index */
 	int	unassigned;	/* number of events to be assigned left */
+	int	nr_gp;		/* number of GP counters used */
 	unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 };
 
@@ -620,27 +664,29 @@ struct sched_state {
 struct perf_sched {
 	int			max_weight;
 	int			max_events;
-	struct perf_event	**events;
-	struct sched_state	state;
+	int			max_gp;
 	int			saved_states;
+	struct event_constraint	**constraints;
+	struct sched_state	state;
 	struct sched_state	saved[SCHED_STATES_MAX];
 };
 
 /*
  * Initialize interator that runs through all events and counters.
  */
-static void perf_sched_init(struct perf_sched *sched, struct perf_event **events,
-			    int num, int wmin, int wmax)
+static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
+			    int num, int wmin, int wmax, int gpmax)
 {
 	int idx;
 
 	memset(sched, 0, sizeof(*sched));
 	sched->max_events	= num;
 	sched->max_weight	= wmax;
-	sched->events		= events;
+	sched->max_gp		= gpmax;
+	sched->constraints	= constraints;
 
 	for (idx = 0; idx < num; idx++) {
-		if (events[idx]->hw.constraint->weight == wmin)
+		if (constraints[idx]->weight == wmin)
 			break;
 	}
 
@@ -687,7 +733,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
 	if (sched->state.event >= sched->max_events)
 		return false;
 
-	c = sched->events[sched->state.event]->hw.constraint;
+	c = sched->constraints[sched->state.event];
 	/* Prefer fixed purpose counters */
 	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
 		idx = INTEL_PMC_IDX_FIXED;
@@ -696,11 +742,16 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
 				goto done;
 		}
 	}
+
 	/* Grab the first unused counter starting with idx */
 	idx = sched->state.counter;
 	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
-		if (!__test_and_set_bit(idx, sched->state.used))
+		if (!__test_and_set_bit(idx, sched->state.used)) {
+			if (sched->state.nr_gp++ >= sched->max_gp)
+				return false;
+
 			goto done;
+		}
 	}
 
 	return false;
@@ -745,7 +796,7 @@ static bool perf_sched_next_event(struct perf_sched *sched)
 			if (sched->state.weight > sched->max_weight)
 				return false;
 		}
-		c = sched->events[sched->state.event]->hw.constraint;
+		c = sched->constraints[sched->state.event];
 	} while (c->weight != sched->state.weight);
 
 	sched->state.counter = 0;	/* start with first counter */
@@ -756,12 +807,12 @@ static bool perf_sched_next_event(struct perf_sched *sched)
 /*
  * Assign a counter for each event.
  */
-int perf_assign_events(struct perf_event **events, int n,
-			int wmin, int wmax, int *assign)
+int perf_assign_events(struct event_constraint **constraints, int n,
+			int wmin, int wmax, int gpmax, int *assign)
 {
 	struct perf_sched sched;
 
-	perf_sched_init(&sched, events, n, wmin, wmax);
+	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);
 
 	do {
 		if (!perf_sched_find_counter(&sched))
@@ -788,9 +839,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 		x86_pmu.start_scheduling(cpuc);
 
 	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
-		hwc = &cpuc->event_list[i]->hw;
+		cpuc->event_constraint[i] = NULL;
 		c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
-		hwc->constraint = c;
+		cpuc->event_constraint[i] = c;
 
 		wmin = min(wmin, c->weight);
 		wmax = max(wmax, c->weight);
@@ -801,7 +852,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	 */
 	for (i = 0; i < n; i++) {
 		hwc = &cpuc->event_list[i]->hw;
-		c = hwc->constraint;
+		c = cpuc->event_constraint[i];
 
 		/* never assigned */
 		if (hwc->idx == -1)
@@ -821,9 +872,26 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	}
 
 	/* slow path */
-	if (i != n)
-		unsched = perf_assign_events(cpuc->event_list, n, wmin,
-					     wmax, assign);
+	if (i != n) {
+		int gpmax = x86_pmu.num_counters;
+
+		/*
+		 * Do not allow scheduling of more than half the available
+		 * generic counters.
+		 *
+		 * This helps avoid counter starvation of sibling thread by
+		 * ensuring at most half the counters cannot be in exclusive
+		 * mode. There is no designated counters for the limits. Any
+		 * N/2 counters can be used. This helps with events with
+		 * specific counter constraints.
+		 */
+		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
+		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
+			gpmax /= 2;
+
+		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
+					     wmax, gpmax, assign);
+	}
 
 	/*
 	 * In case of success (unsched = 0), mark events as committed,
@@ -840,12 +908,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 			e = cpuc->event_list[i];
 			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
 			if (x86_pmu.commit_scheduling)
-				x86_pmu.commit_scheduling(cpuc, e, assign[i]);
+				x86_pmu.commit_scheduling(cpuc, i, assign[i]);
 		}
-	}
-
-	if (!assign || unsched) {
-
+	} else {
 		for (i = 0; i < n; i++) {
 			e = cpuc->event_list[i];
 			/*
@@ -1058,13 +1123,16 @@ int x86_perf_event_set_period(struct perf_event *event)
 
 	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
 
-	/*
-	 * The hw event starts counting from this event offset,
-	 * mark it to be able to extra future deltas:
-	 */
-	local64_set(&hwc->prev_count, (u64)-left);
+	if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) ||
+	    local64_read(&hwc->prev_count) != (u64)-left) {
+		/*
+		 * The hw event starts counting from this event offset,
+		 * mark it to be able to extra future deltas:
+		 */
+		local64_set(&hwc->prev_count, (u64)-left);
 
-	wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+		wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+	}
 
 	/*
 	 * Due to erratum on certan cpu we need
@@ -1292,8 +1360,10 @@ static void x86_pmu_del(struct perf_event *event, int flags)
 		x86_pmu.put_event_constraints(cpuc, event);
 
 	/* Delete the array entry. */
-	while (++i < cpuc->n_events)
+	while (++i < cpuc->n_events) {
 		cpuc->event_list[i-1] = cpuc->event_list[i];
+		cpuc->event_constraint[i-1] = cpuc->event_constraint[i];
+	}
 	--cpuc->n_events;
 
 	perf_event_update_userpage(event);
@@ -1374,6 +1444,10 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 	u64 finish_clock;
 	int ret;
 
+	/*
+	 * All PMUs/events that share this PMI handler should make sure to
+	 * increment active_events for their events.
+	 */
 	if (!atomic_read(&active_events))
 		return NMI_DONE;
 
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 6ac5cb7a9e14..3e7fd27dfe20 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -74,6 +74,9 @@ struct event_constraint {
 #define PERF_X86_EVENT_EXCL		0x0040 /* HT exclusivity on counter */
 #define PERF_X86_EVENT_DYNAMIC		0x0080 /* dynamic alloc'd constraint */
 #define PERF_X86_EVENT_RDPMC_ALLOWED	0x0100 /* grant rdpmc permission */
+#define PERF_X86_EVENT_EXCL_ACCT	0x0200 /* accounted EXCL event */
+#define PERF_X86_EVENT_AUTO_RELOAD	0x0400 /* use PEBS auto-reload */
+#define PERF_X86_EVENT_FREERUNNING	0x0800 /* use freerunning PEBS */
 
 
 struct amd_nb {
@@ -87,6 +90,18 @@ struct amd_nb {
 #define MAX_PEBS_EVENTS		8
 
 /*
+ * Flags PEBS can handle without an PMI.
+ *
+ * TID can only be handled by flushing at context switch.
+ *
+ */
+#define PEBS_FREERUNNING_FLAGS \
+	(PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
+	PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
+	PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
+	PERF_SAMPLE_TRANSACTION)
+
+/*
  * A debug store configuration.
  *
  * We only support architectures that use 64bit fields.
@@ -132,10 +147,7 @@ enum intel_excl_state_type {
 };
 
 struct intel_excl_states {
-	enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
 	enum intel_excl_state_type state[X86_PMC_IDX_MAX];
-	int  num_alloc_cntrs;/* #counters allocated */
-	int  max_alloc_cntrs;/* max #counters allowed */
 	bool sched_started; /* true if scheduling has started */
 };
 
@@ -144,6 +156,11 @@ struct intel_excl_cntrs {
 
 	struct intel_excl_states states[2];
 
+	union {
+		u16	has_exclusive[2];
+		u32	exclusive_present;
+	};
+
 	int		refcnt;		/* per-core: #HT threads */
 	unsigned	core_id;	/* per-core: core id */
 };
@@ -172,7 +189,11 @@ struct cpu_hw_events {
 					     added in the current transaction */
 	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
 	u64			tags[X86_PMC_IDX_MAX];
+
 	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+	struct event_constraint	*event_constraint[X86_PMC_IDX_MAX];
+
+	int			n_excl; /* the number of exclusive events */
 
 	unsigned int		group_flag;
 	int			is_fake;
@@ -519,12 +540,10 @@ struct x86_pmu {
 	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
 
-	void		(*commit_scheduling)(struct cpu_hw_events *cpuc,
-					     struct perf_event *event,
-					     int cntr);
-
 	void		(*start_scheduling)(struct cpu_hw_events *cpuc);
 
+	void		(*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr);
+
 	void		(*stop_scheduling)(struct cpu_hw_events *cpuc);
 
 	struct event_constraint *event_constraints;
@@ -697,6 +716,10 @@ int x86_add_exclusive(unsigned int what);
 
 void x86_del_exclusive(unsigned int what);
 
+int x86_reserve_hardware(void);
+
+void x86_release_hardware(void);
+
 void hw_perf_lbr_event_destroy(struct perf_event *event);
 
 int x86_setup_perfctr(struct perf_event *event);
@@ -717,8 +740,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 
 void x86_pmu_enable_all(int added);
 
-int perf_assign_events(struct perf_event **events, int n,
-			int wmin, int wmax, int *assign);
+int perf_assign_events(struct event_constraint **constraints, int n,
+			int wmin, int wmax, int gpmax, int *assign);
 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
 
 void x86_pmu_stop(struct perf_event *event, int flags);
@@ -860,6 +883,8 @@ void intel_pmu_pebs_enable_all(void);
 
 void intel_pmu_pebs_disable_all(void);
 
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
+
 void intel_ds_init(void);
 
 void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
@@ -929,4 +954,8 @@ static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
 	return NULL;
 }
 
+static inline int is_ht_workaround_enabled(void)
+{
+	return 0;
+}
 #endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 3998131d1a68..b9826a981fb2 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1903,9 +1903,8 @@ static void
 intel_start_scheduling(struct cpu_hw_events *cpuc)
 {
 	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-	struct intel_excl_states *xl, *xlo;
+	struct intel_excl_states *xl;
 	int tid = cpuc->excl_thread_id;
-	int o_tid = 1 - tid; /* sibling thread */
 
 	/*
 	 * nothing needed if in group validation mode
@@ -1916,35 +1915,52 @@ intel_start_scheduling(struct cpu_hw_events *cpuc)
 	/*
 	 * no exclusion needed
 	 */
-	if (!excl_cntrs)
+	if (WARN_ON_ONCE(!excl_cntrs))
 		return;
 
-	xlo = &excl_cntrs->states[o_tid];
 	xl = &excl_cntrs->states[tid];
 
 	xl->sched_started = true;
-	xl->num_alloc_cntrs = 0;
 	/*
 	 * lock shared state until we are done scheduling
 	 * in stop_event_scheduling()
 	 * makes scheduling appear as a transaction
 	 */
-	WARN_ON_ONCE(!irqs_disabled());
 	raw_spin_lock(&excl_cntrs->lock);
+}
 
-	/*
-	 * save initial state of sibling thread
-	 */
-	memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr)
+{
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct event_constraint *c = cpuc->event_constraint[idx];
+	struct intel_excl_states *xl;
+	int tid = cpuc->excl_thread_id;
+
+	if (cpuc->is_fake || !is_ht_workaround_enabled())
+		return;
+
+	if (WARN_ON_ONCE(!excl_cntrs))
+		return;
+
+	if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+		return;
+
+	xl = &excl_cntrs->states[tid];
+
+	lockdep_assert_held(&excl_cntrs->lock);
+
+	if (c->flags & PERF_X86_EVENT_EXCL)
+		xl->state[cntr] = INTEL_EXCL_EXCLUSIVE;
+	else
+		xl->state[cntr] = INTEL_EXCL_SHARED;
 }
 
 static void
 intel_stop_scheduling(struct cpu_hw_events *cpuc)
 {
 	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-	struct intel_excl_states *xl, *xlo;
+	struct intel_excl_states *xl;
 	int tid = cpuc->excl_thread_id;
-	int o_tid = 1 - tid; /* sibling thread */
 
 	/*
 	 * nothing needed if in group validation mode
@@ -1954,17 +1970,11 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc)
 	/*
 	 * no exclusion needed
 	 */
-	if (!excl_cntrs)
+	if (WARN_ON_ONCE(!excl_cntrs))
 		return;
 
-	xlo = &excl_cntrs->states[o_tid];
 	xl = &excl_cntrs->states[tid];
 
-	/*
-	 * make new sibling thread state visible
-	 */
-	memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
-
 	xl->sched_started = false;
 	/*
 	 * release shared state lock (acquired in intel_start_scheduling())
@@ -1976,12 +1986,10 @@ static struct event_constraint *
 intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 			   int idx, struct event_constraint *c)
 {
-	struct event_constraint *cx;
 	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-	struct intel_excl_states *xl, *xlo;
-	int is_excl, i;
+	struct intel_excl_states *xlo;
 	int tid = cpuc->excl_thread_id;
-	int o_tid = 1 - tid; /* alternate */
+	int is_excl, i;
 
 	/*
 	 * validating a group does not require
@@ -1993,34 +2001,8 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	/*
 	 * no exclusion needed
 	 */
-	if (!excl_cntrs)
+	if (WARN_ON_ONCE(!excl_cntrs))
 		return c;
-	/*
-	 * event requires exclusive counter access
-	 * across HT threads
-	 */
-	is_excl = c->flags & PERF_X86_EVENT_EXCL;
-
-	/*
-	 * xl = state of current HT
-	 * xlo = state of sibling HT
-	 */
-	xl = &excl_cntrs->states[tid];
-	xlo = &excl_cntrs->states[o_tid];
-
-	/*
-	 * do not allow scheduling of more than max_alloc_cntrs
-	 * which is set to half the available generic counters.
-	 * this helps avoid counter starvation of sibling thread
-	 * by ensuring at most half the counters cannot be in
-	 * exclusive mode. There is not designated counters for the
-	 * limits. Any N/2 counters can be used. This helps with
-	 * events with specifix counter constraints
-	 */
-	if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
-		return &emptyconstraint;
-
-	cx = c;
 
 	/*
 	 * because we modify the constraint, we need
@@ -2031,10 +2013,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	 * been cloned (marked dynamic)
 	 */
 	if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
-
-		/* sanity check */
-		if (idx < 0)
-			return &emptyconstraint;
+		struct event_constraint *cx;
 
 		/*
 		 * grab pre-allocated constraint entry
@@ -2045,13 +2024,14 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 		 * initialize dynamic constraint
 		 * with static constraint
 		 */
-		memcpy(cx, c, sizeof(*cx));
+		*cx = *c;
 
 		/*
 		 * mark constraint as dynamic, so we
 		 * can free it later on
 		 */
 		cx->flags |= PERF_X86_EVENT_DYNAMIC;
+		c = cx;
 	}
 
 	/*
@@ -2062,6 +2042,22 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	 */
 
 	/*
+	 * state of sibling HT
+	 */
+	xlo = &excl_cntrs->states[tid ^ 1];
+
+	/*
+	 * event requires exclusive counter access
+	 * across HT threads
+	 */
+	is_excl = c->flags & PERF_X86_EVENT_EXCL;
+	if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) {
+		event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT;
+		if (!cpuc->n_excl++)
+			WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1);
+	}
+
+	/*
 	 * Modify static constraint with current dynamic
 	 * state of thread
 	 *
@@ -2069,44 +2065,44 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	 * SHARED   : sibling counter measuring non-exclusive event
 	 * UNUSED   : sibling counter unused
 	 */
-	for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) {
+	for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) {
 		/*
 		 * exclusive event in sibling counter
 		 * our corresponding counter cannot be used
 		 * regardless of our event
 		 */
-		if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
-			__clear_bit(i, cx->idxmsk);
+		if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE)
+			__clear_bit(i, c->idxmsk);
 		/*
 		 * if measuring an exclusive event, sibling
 		 * measuring non-exclusive, then counter cannot
 		 * be used
 		 */
-		if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
-			__clear_bit(i, cx->idxmsk);
+		if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED)
+			__clear_bit(i, c->idxmsk);
 	}
 
 	/*
 	 * recompute actual bit weight for scheduling algorithm
 	 */
-	cx->weight = hweight64(cx->idxmsk64);
+	c->weight = hweight64(c->idxmsk64);
 
 	/*
 	 * if we return an empty mask, then switch
 	 * back to static empty constraint to avoid
 	 * the cost of freeing later on
 	 */
-	if (cx->weight == 0)
-		cx = &emptyconstraint;
+	if (c->weight == 0)
+		c = &emptyconstraint;
 
-	return cx;
+	return c;
 }
 
 static struct event_constraint *
 intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 			    struct perf_event *event)
 {
-	struct event_constraint *c1 = event->hw.constraint;
+	struct event_constraint *c1 = cpuc->event_constraint[idx];
 	struct event_constraint *c2;
 
 	/*
@@ -2132,10 +2128,8 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
 {
 	struct hw_perf_event *hwc = &event->hw;
 	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-	struct intel_excl_states *xlo, *xl;
-	unsigned long flags = 0; /* keep compiler happy */
 	int tid = cpuc->excl_thread_id;
-	int o_tid = 1 - tid;
+	struct intel_excl_states *xl;
 
 	/*
 	 * nothing needed if in group validation mode
@@ -2143,31 +2137,35 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
 	if (cpuc->is_fake)
 		return;
 
-	WARN_ON_ONCE(!excl_cntrs);
-
-	if (!excl_cntrs)
+	if (WARN_ON_ONCE(!excl_cntrs))
 		return;
 
-	xl = &excl_cntrs->states[tid];
-	xlo = &excl_cntrs->states[o_tid];
+	if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) {
+		hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT;
+		if (!--cpuc->n_excl)
+			WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0);
+	}
 
 	/*
-	 * put_constraint may be called from x86_schedule_events()
-	 * which already has the lock held so here make locking
-	 * conditional
+	 * If event was actually assigned, then mark the counter state as
+	 * unused now.
 	 */
-	if (!xl->sched_started)
-		raw_spin_lock_irqsave(&excl_cntrs->lock, flags);
+	if (hwc->idx >= 0) {
+		xl = &excl_cntrs->states[tid];
 
-	/*
-	 * if event was actually assigned, then mark the
-	 * counter state as unused now
-	 */
-	if (hwc->idx >= 0)
-		xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
+		/*
+		 * put_constraint may be called from x86_schedule_events()
+		 * which already has the lock held so here make locking
+		 * conditional.
+		 */
+		if (!xl->sched_started)
+			raw_spin_lock(&excl_cntrs->lock);
 
-	if (!xl->sched_started)
-		raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
+		xl->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+		if (!xl->sched_started)
+			raw_spin_unlock(&excl_cntrs->lock);
+	}
 }
 
 static void
@@ -2188,8 +2186,6 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
 static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
 					struct perf_event *event)
 {
-	struct event_constraint *c = event->hw.constraint;
-
 	intel_put_shared_regs_event_constraints(cpuc, event);
 
 	/*
@@ -2197,48 +2193,8 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
 	 * all events are subject to and must call the
 	 * put_excl_constraints() routine
 	 */
-	if (c && cpuc->excl_cntrs)
+	if (cpuc->excl_cntrs)
 		intel_put_excl_constraints(cpuc, event);
-
-	/* cleanup dynamic constraint */
-	if (c && (c->flags & PERF_X86_EVENT_DYNAMIC))
-		event->hw.constraint = NULL;
-}
-
-static void intel_commit_scheduling(struct cpu_hw_events *cpuc,
-				    struct perf_event *event, int cntr)
-{
-	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
-	struct event_constraint *c = event->hw.constraint;
-	struct intel_excl_states *xlo, *xl;
-	int tid = cpuc->excl_thread_id;
-	int o_tid = 1 - tid;
-	int is_excl;
-
-	if (cpuc->is_fake || !c)
-		return;
-
-	is_excl = c->flags & PERF_X86_EVENT_EXCL;
-
-	if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
-		return;
-
-	WARN_ON_ONCE(!excl_cntrs);
-
-	if (!excl_cntrs)
-		return;
-
-	xl = &excl_cntrs->states[tid];
-	xlo = &excl_cntrs->states[o_tid];
-
-	WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
-
-	if (cntr >= 0) {
-		if (is_excl)
-			xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
-		else
-			xlo->init_state[cntr] = INTEL_EXCL_SHARED;
-	}
 }
 
 static void intel_pebs_aliases_core2(struct perf_event *event)
@@ -2304,8 +2260,15 @@ static int intel_pmu_hw_config(struct perf_event *event)
 	if (ret)
 		return ret;
 
-	if (event->attr.precise_ip && x86_pmu.pebs_aliases)
-		x86_pmu.pebs_aliases(event);
+	if (event->attr.precise_ip) {
+		if (!event->attr.freq) {
+			event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
+			if (!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS))
+				event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
+		}
+		if (x86_pmu.pebs_aliases)
+			x86_pmu.pebs_aliases(event);
+	}
 
 	if (needs_branch_stack(event)) {
 		ret = intel_pmu_setup_lbr_filter(event);
@@ -2554,19 +2517,11 @@ struct intel_shared_regs *allocate_shared_regs(int cpu)
 static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
 {
 	struct intel_excl_cntrs *c;
-	int i;
 
 	c = kzalloc_node(sizeof(struct intel_excl_cntrs),
 			 GFP_KERNEL, cpu_to_node(cpu));
 	if (c) {
 		raw_spin_lock_init(&c->lock);
-		for (i = 0; i < X86_PMC_IDX_MAX; i++) {
-			c->states[0].state[i] = INTEL_EXCL_UNUSED;
-			c->states[0].init_state[i] = INTEL_EXCL_UNUSED;
-
-			c->states[1].state[i] = INTEL_EXCL_UNUSED;
-			c->states[1].init_state[i] = INTEL_EXCL_UNUSED;
-		}
 		c->core_id = -1;
 	}
 	return c;
@@ -2621,7 +2576,7 @@ static void intel_pmu_cpu_starting(int cpu)
 	if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
 		void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
 
-		for_each_cpu(i, topology_thread_cpumask(cpu)) {
+		for_each_cpu(i, topology_sibling_cpumask(cpu)) {
 			struct intel_shared_regs *pc;
 
 			pc = per_cpu(cpu_hw_events, i).shared_regs;
@@ -2639,9 +2594,7 @@ static void intel_pmu_cpu_starting(int cpu)
 		cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
 
 	if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
-		int h = x86_pmu.num_counters >> 1;
-
-		for_each_cpu(i, topology_thread_cpumask(cpu)) {
+		for_each_cpu(i, topology_sibling_cpumask(cpu)) {
 			struct intel_excl_cntrs *c;
 
 			c = per_cpu(cpu_hw_events, i).excl_cntrs;
@@ -2654,11 +2607,6 @@ static void intel_pmu_cpu_starting(int cpu)
 		}
 		cpuc->excl_cntrs->core_id = core_id;
 		cpuc->excl_cntrs->refcnt++;
-		/*
-		 * set hard limit to half the number of generic counters
-		 */
-		cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
-		cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
 	}
 }
 
@@ -2694,6 +2642,15 @@ static void intel_pmu_cpu_dying(int cpu)
 	fini_debug_store_on_cpu(cpu);
 }
 
+static void intel_pmu_sched_task(struct perf_event_context *ctx,
+				 bool sched_in)
+{
+	if (x86_pmu.pebs_active)
+		intel_pmu_pebs_sched_task(ctx, sched_in);
+	if (x86_pmu.lbr_nr)
+		intel_pmu_lbr_sched_task(ctx, sched_in);
+}
+
 PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
 
 PMU_FORMAT_ATTR(ldlat, "config1:0-15");
@@ -2783,7 +2740,7 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.cpu_starting		= intel_pmu_cpu_starting,
 	.cpu_dying		= intel_pmu_cpu_dying,
 	.guest_get_msrs		= intel_guest_get_msrs,
-	.sched_task		= intel_pmu_lbr_sched_task,
+	.sched_task		= intel_pmu_sched_task,
 };
 
 static __init void intel_clovertown_quirk(void)
@@ -2956,8 +2913,8 @@ static __init void intel_ht_bug(void)
 {
 	x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
 
-	x86_pmu.commit_scheduling = intel_commit_scheduling;
 	x86_pmu.start_scheduling = intel_start_scheduling;
+	x86_pmu.commit_scheduling = intel_commit_scheduling;
 	x86_pmu.stop_scheduling = intel_stop_scheduling;
 }
 
@@ -3270,6 +3227,8 @@ __init int intel_pmu_init(void)
 
 	case 61: /* 14nm Broadwell Core-M */
 	case 86: /* 14nm Broadwell Xeon D */
+	case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+	case 79: /* 14nm Broadwell Server */
 		x86_pmu.late_ack = true;
 		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
@@ -3339,13 +3298,13 @@ __init int intel_pmu_init(void)
 		 * counter, so do not extend mask to generic counters
 		 */
 		for_each_event_constraint(c, x86_pmu.event_constraints) {
-			if (c->cmask != FIXED_EVENT_FLAGS
-			    || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) {
-				continue;
+			if (c->cmask == FIXED_EVENT_FLAGS
+			    && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) {
+				c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
 			}
-
-			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
-			c->weight += x86_pmu.num_counters;
+			c->idxmsk64 &=
+				~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
+			c->weight = hweight64(c->idxmsk64);
 		}
 	}
 
@@ -3403,7 +3362,7 @@ static __init int fixup_ht_bug(void)
 	if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
 		return 0;
 
-	w = cpumask_weight(topology_thread_cpumask(cpu));
+	w = cpumask_weight(topology_sibling_cpumask(cpu));
 	if (w > 1) {
 		pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
 		return 0;
@@ -3413,8 +3372,8 @@ static __init int fixup_ht_bug(void)
 
 	x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
 
-	x86_pmu.commit_scheduling = NULL;
 	x86_pmu.start_scheduling = NULL;
+	x86_pmu.commit_scheduling = NULL;
 	x86_pmu.stop_scheduling = NULL;
 
 	watchdog_nmi_enable_all();
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
index ac1f0c55f379..7795f3f8b1d5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_bts.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -483,17 +483,26 @@ static int bts_event_add(struct perf_event *event, int mode)
 
 static void bts_event_destroy(struct perf_event *event)
 {
+	x86_release_hardware();
 	x86_del_exclusive(x86_lbr_exclusive_bts);
 }
 
 static int bts_event_init(struct perf_event *event)
 {
+	int ret;
+
 	if (event->attr.type != bts_pmu.type)
 		return -ENOENT;
 
 	if (x86_add_exclusive(x86_lbr_exclusive_bts))
 		return -EBUSY;
 
+	ret = x86_reserve_hardware();
+	if (ret) {
+		x86_del_exclusive(x86_lbr_exclusive_bts);
+		return ret;
+	}
+
 	event->destroy = bts_event_destroy;
 
 	return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index e4d1b8b738fa..188076161c1b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -13,16 +13,35 @@
 #define MSR_IA32_QM_CTR		0x0c8e
 #define MSR_IA32_QM_EVTSEL	0x0c8d
 
-static unsigned int cqm_max_rmid = -1;
+static u32 cqm_max_rmid = -1;
 static unsigned int cqm_l3_scale; /* supposedly cacheline size */
 
-struct intel_cqm_state {
-	raw_spinlock_t		lock;
-	int			rmid;
-	int			cnt;
+/**
+ * struct intel_pqr_state - State cache for the PQR MSR
+ * @rmid:		The cached Resource Monitoring ID
+ * @closid:		The cached Class Of Service ID
+ * @rmid_usecnt:	The usage counter for rmid
+ *
+ * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct intel_pqr_state {
+	u32			rmid;
+	u32			closid;
+	int			rmid_usecnt;
 };
 
-static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Both functions which modify the state
+ * (intel_cqm_event_start and intel_cqm_event_stop) are called with
+ * interrupts disabled, which is sufficient for the protection.
+ */
+static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
 
 /*
  * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
@@ -57,7 +76,7 @@ static cpumask_t cqm_cpumask;
  * near-zero occupancy value, i.e. no cachelines are tagged with this
  * RMID, once __intel_cqm_rmid_rotate() returns.
  */
-static unsigned int intel_cqm_rotation_rmid;
+static u32 intel_cqm_rotation_rmid;
 
 #define INVALID_RMID		(-1)
 
@@ -69,7 +88,7 @@ static unsigned int intel_cqm_rotation_rmid;
  * Likewise, an rmid value of -1 is used to indicate "no rmid currently
  * assigned" and is used as part of the rotation code.
  */
-static inline bool __rmid_valid(unsigned int rmid)
+static inline bool __rmid_valid(u32 rmid)
 {
 	if (!rmid || rmid == INVALID_RMID)
 		return false;
@@ -77,7 +96,7 @@ static inline bool __rmid_valid(unsigned int rmid)
 	return true;
 }
 
-static u64 __rmid_read(unsigned int rmid)
+static u64 __rmid_read(u32 rmid)
 {
 	u64 val;
 
@@ -102,7 +121,7 @@ enum rmid_recycle_state {
 };
 
 struct cqm_rmid_entry {
-	unsigned int rmid;
+	u32 rmid;
 	enum rmid_recycle_state state;
 	struct list_head list;
 	unsigned long queue_time;
@@ -147,7 +166,7 @@ static LIST_HEAD(cqm_rmid_limbo_lru);
  */
 static struct cqm_rmid_entry **cqm_rmid_ptrs;
 
-static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
+static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
 {
 	struct cqm_rmid_entry *entry;
 
@@ -162,7 +181,7 @@ static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
  *
  * We expect to be called with cache_mutex held.
  */
-static int __get_rmid(void)
+static u32 __get_rmid(void)
 {
 	struct cqm_rmid_entry *entry;
 
@@ -177,7 +196,7 @@ static int __get_rmid(void)
 	return entry->rmid;
 }
 
-static void __put_rmid(unsigned int rmid)
+static void __put_rmid(u32 rmid)
 {
 	struct cqm_rmid_entry *entry;
 
@@ -372,7 +391,7 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
 }
 
 struct rmid_read {
-	unsigned int rmid;
+	u32 rmid;
 	atomic64_t value;
 };
 
@@ -381,12 +400,11 @@ static void __intel_cqm_event_count(void *info);
 /*
  * Exchange the RMID of a group of events.
  */
-static unsigned int
-intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
+static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
 {
 	struct perf_event *event;
-	unsigned int old_rmid = group->hw.cqm_rmid;
 	struct list_head *head = &group->hw.cqm_group_entry;
+	u32 old_rmid = group->hw.cqm_rmid;
 
 	lockdep_assert_held(&cache_mutex);
 
@@ -451,7 +469,7 @@ static void intel_cqm_stable(void *arg)
  * If we have group events waiting for an RMID that don't conflict with
  * events already running, assign @rmid.
  */
-static bool intel_cqm_sched_in_event(unsigned int rmid)
+static bool intel_cqm_sched_in_event(u32 rmid)
 {
 	struct perf_event *leader, *event;
 
@@ -598,7 +616,7 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available)
 static void __intel_cqm_pick_and_rotate(struct perf_event *next)
 {
 	struct perf_event *rotor;
-	unsigned int rmid;
+	u32 rmid;
 
 	lockdep_assert_held(&cache_mutex);
 
@@ -626,7 +644,7 @@ static void __intel_cqm_pick_and_rotate(struct perf_event *next)
 static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
 {
 	struct perf_event *group, *g;
-	unsigned int rmid;
+	u32 rmid;
 
 	lockdep_assert_held(&cache_mutex);
 
@@ -828,8 +846,8 @@ static void intel_cqm_setup_event(struct perf_event *event,
 				  struct perf_event **group)
 {
 	struct perf_event *iter;
-	unsigned int rmid;
 	bool conflict = false;
+	u32 rmid;
 
 	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
 		rmid = iter->hw.cqm_rmid;
@@ -860,7 +878,7 @@ static void intel_cqm_setup_event(struct perf_event *event,
 static void intel_cqm_event_read(struct perf_event *event)
 {
 	unsigned long flags;
-	unsigned int rmid;
+	u32 rmid;
 	u64 val;
 
 	/*
@@ -961,55 +979,48 @@ out:
 
 static void intel_cqm_event_start(struct perf_event *event, int mode)
 {
-	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
-	unsigned int rmid = event->hw.cqm_rmid;
-	unsigned long flags;
+	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+	u32 rmid = event->hw.cqm_rmid;
 
 	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
 		return;
 
 	event->hw.cqm_state &= ~PERF_HES_STOPPED;
 
-	raw_spin_lock_irqsave(&state->lock, flags);
-
-	if (state->cnt++)
-		WARN_ON_ONCE(state->rmid != rmid);
-	else
+	if (state->rmid_usecnt++) {
+		if (!WARN_ON_ONCE(state->rmid != rmid))
+			return;
+	} else {
 		WARN_ON_ONCE(state->rmid);
+	}
 
 	state->rmid = rmid;
-	wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
-
-	raw_spin_unlock_irqrestore(&state->lock, flags);
+	wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
 }
 
 static void intel_cqm_event_stop(struct perf_event *event, int mode)
 {
-	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
-	unsigned long flags;
+	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
 
 	if (event->hw.cqm_state & PERF_HES_STOPPED)
 		return;
 
 	event->hw.cqm_state |= PERF_HES_STOPPED;
 
-	raw_spin_lock_irqsave(&state->lock, flags);
 	intel_cqm_event_read(event);
 
-	if (!--state->cnt) {
+	if (!--state->rmid_usecnt) {
 		state->rmid = 0;
-		wrmsrl(MSR_IA32_PQR_ASSOC, 0);
+		wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
 	} else {
 		WARN_ON_ONCE(!state->rmid);
 	}
-
-	raw_spin_unlock_irqrestore(&state->lock, flags);
 }
 
 static int intel_cqm_event_add(struct perf_event *event, int mode)
 {
 	unsigned long flags;
-	unsigned int rmid;
+	u32 rmid;
 
 	raw_spin_lock_irqsave(&cache_lock, flags);
 
@@ -1024,11 +1035,6 @@ static int intel_cqm_event_add(struct perf_event *event, int mode)
 	return 0;
 }
 
-static void intel_cqm_event_del(struct perf_event *event, int mode)
-{
-	intel_cqm_event_stop(event, mode);
-}
-
 static void intel_cqm_event_destroy(struct perf_event *event)
 {
 	struct perf_event *group_other = NULL;
@@ -1057,7 +1063,7 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 			list_replace(&event->hw.cqm_groups_entry,
 				     &group_other->hw.cqm_groups_entry);
 		} else {
-			unsigned int rmid = event->hw.cqm_rmid;
+			u32 rmid = event->hw.cqm_rmid;
 
 			if (__rmid_valid(rmid))
 				__put_rmid(rmid);
@@ -1221,7 +1227,7 @@ static struct pmu intel_cqm_pmu = {
 	.task_ctx_nr	     = perf_sw_context,
 	.event_init	     = intel_cqm_event_init,
 	.add		     = intel_cqm_event_add,
-	.del		     = intel_cqm_event_del,
+	.del		     = intel_cqm_event_stop,
 	.start		     = intel_cqm_event_start,
 	.stop		     = intel_cqm_event_stop,
 	.read		     = intel_cqm_event_read,
@@ -1243,12 +1249,12 @@ static inline void cqm_pick_event_reader(int cpu)
 
 static void intel_cqm_cpu_prepare(unsigned int cpu)
 {
-	struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
+	struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
-	raw_spin_lock_init(&state->lock);
 	state->rmid = 0;
-	state->cnt  = 0;
+	state->closid = 0;
+	state->rmid_usecnt = 0;
 
 	WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
 	WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 813f75d71175..71fc40238843 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -11,7 +11,7 @@
 #define BTS_RECORD_SIZE		24
 
 #define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE	PAGE_SIZE
+#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)
 #define PEBS_FIXUP_SIZE		PAGE_SIZE
 
 /*
@@ -250,7 +250,7 @@ static int alloc_pebs_buffer(int cpu)
 {
 	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
 	int node = cpu_to_node(cpu);
-	int max, thresh = 1; /* always use a single PEBS record */
+	int max;
 	void *buffer, *ibuffer;
 
 	if (!x86_pmu.pebs)
@@ -280,9 +280,6 @@ static int alloc_pebs_buffer(int cpu)
 	ds->pebs_absolute_maximum = ds->pebs_buffer_base +
 		max * x86_pmu.pebs_record_size;
 
-	ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
-		thresh * x86_pmu.pebs_record_size;
-
 	return 0;
 }
 
@@ -549,6 +546,19 @@ int intel_pmu_drain_bts_buffer(void)
 	return 1;
 }
 
+static inline void intel_pmu_drain_pebs_buffer(void)
+{
+	struct pt_regs regs;
+
+	x86_pmu.drain_pebs(&regs);
+}
+
+void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+	if (!sched_in)
+		intel_pmu_drain_pebs_buffer();
+}
+
 /*
  * PEBS
  */
@@ -684,33 +694,81 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
 	return &emptyconstraint;
 }
 
+static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+{
+	return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+}
+
 void intel_pmu_pebs_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
+	struct debug_store *ds = cpuc->ds;
+	bool first_pebs;
+	u64 threshold;
 
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
+	first_pebs = !pebs_is_enabled(cpuc);
 	cpuc->pebs_enabled |= 1ULL << hwc->idx;
 
 	if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
 		cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32);
 	else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
 		cpuc->pebs_enabled |= 1ULL << 63;
+
+	/*
+	 * When the event is constrained enough we can use a larger
+	 * threshold and run the event with less frequent PMI.
+	 */
+	if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
+		threshold = ds->pebs_absolute_maximum -
+			x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+
+		if (first_pebs)
+			perf_sched_cb_inc(event->ctx->pmu);
+	} else {
+		threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+
+		/*
+		 * If not all events can use larger buffer,
+		 * roll back to threshold = 1
+		 */
+		if (!first_pebs &&
+		    (ds->pebs_interrupt_threshold > threshold))
+			perf_sched_cb_dec(event->ctx->pmu);
+	}
+
+	/* Use auto-reload if possible to save a MSR write in the PMI */
+	if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
+		ds->pebs_event_reset[hwc->idx] =
+			(u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
+	}
+
+	if (first_pebs || ds->pebs_interrupt_threshold > threshold)
+		ds->pebs_interrupt_threshold = threshold;
 }
 
 void intel_pmu_pebs_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
+	struct debug_store *ds = cpuc->ds;
 
 	cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
 
-	if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT)
+	if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
 		cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32));
-	else if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_ST)
+	else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
 		cpuc->pebs_enabled &= ~(1ULL << 63);
 
+	if (ds->pebs_interrupt_threshold >
+	    ds->pebs_buffer_base + x86_pmu.pebs_record_size) {
+		intel_pmu_drain_pebs_buffer();
+		if (!pebs_is_enabled(cpuc))
+			perf_sched_cb_dec(event->ctx->pmu);
+	}
+
 	if (cpuc->enabled)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 
@@ -846,8 +904,10 @@ static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
 	return txn;
 }
 
-static void __intel_pmu_pebs_event(struct perf_event *event,
-				   struct pt_regs *iregs, void *__pebs)
+static void setup_pebs_sample_data(struct perf_event *event,
+				   struct pt_regs *iregs, void *__pebs,
+				   struct perf_sample_data *data,
+				   struct pt_regs *regs)
 {
 #define PERF_X86_EVENT_PEBS_HSW_PREC \
 		(PERF_X86_EVENT_PEBS_ST_HSW | \
@@ -859,13 +919,11 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	 */
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct pebs_record_hsw *pebs = __pebs;
-	struct perf_sample_data data;
-	struct pt_regs regs;
 	u64 sample_type;
 	int fll, fst, dsrc;
 	int fl = event->hw.flags;
 
-	if (!intel_pmu_save_and_restart(event))
+	if (pebs == NULL)
 		return;
 
 	sample_type = event->attr.sample_type;
@@ -874,15 +932,15 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	fll = fl & PERF_X86_EVENT_PEBS_LDLAT;
 	fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
 
-	perf_sample_data_init(&data, 0, event->hw.last_period);
+	perf_sample_data_init(data, 0, event->hw.last_period);
 
-	data.period = event->hw.last_period;
+	data->period = event->hw.last_period;
 
 	/*
 	 * Use latency for weight (only avail with PEBS-LL)
 	 */
 	if (fll && (sample_type & PERF_SAMPLE_WEIGHT))
-		data.weight = pebs->lat;
+		data->weight = pebs->lat;
 
 	/*
 	 * data.data_src encodes the data source
@@ -895,7 +953,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 			val = precise_datala_hsw(event, pebs->dse);
 		else if (fst)
 			val = precise_store_data(pebs->dse);
-		data.data_src.val = val;
+		data->data_src.val = val;
 	}
 
 	/*
@@ -908,61 +966,123 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	 * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
 	 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
 	 */
-	regs = *iregs;
-	regs.flags = pebs->flags;
-	set_linear_ip(&regs, pebs->ip);
-	regs.bp = pebs->bp;
-	regs.sp = pebs->sp;
+	*regs = *iregs;
+	regs->flags = pebs->flags;
+	set_linear_ip(regs, pebs->ip);
+	regs->bp = pebs->bp;
+	regs->sp = pebs->sp;
 
 	if (sample_type & PERF_SAMPLE_REGS_INTR) {
-		regs.ax = pebs->ax;
-		regs.bx = pebs->bx;
-		regs.cx = pebs->cx;
-		regs.dx = pebs->dx;
-		regs.si = pebs->si;
-		regs.di = pebs->di;
-		regs.bp = pebs->bp;
-		regs.sp = pebs->sp;
-
-		regs.flags = pebs->flags;
+		regs->ax = pebs->ax;
+		regs->bx = pebs->bx;
+		regs->cx = pebs->cx;
+		regs->dx = pebs->dx;
+		regs->si = pebs->si;
+		regs->di = pebs->di;
+		regs->bp = pebs->bp;
+		regs->sp = pebs->sp;
+
+		regs->flags = pebs->flags;
 #ifndef CONFIG_X86_32
-		regs.r8 = pebs->r8;
-		regs.r9 = pebs->r9;
-		regs.r10 = pebs->r10;
-		regs.r11 = pebs->r11;
-		regs.r12 = pebs->r12;
-		regs.r13 = pebs->r13;
-		regs.r14 = pebs->r14;
-		regs.r15 = pebs->r15;
+		regs->r8 = pebs->r8;
+		regs->r9 = pebs->r9;
+		regs->r10 = pebs->r10;
+		regs->r11 = pebs->r11;
+		regs->r12 = pebs->r12;
+		regs->r13 = pebs->r13;
+		regs->r14 = pebs->r14;
+		regs->r15 = pebs->r15;
 #endif
 	}
 
 	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) {
-		regs.ip = pebs->real_ip;
-		regs.flags |= PERF_EFLAGS_EXACT;
-	} else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
-		regs.flags |= PERF_EFLAGS_EXACT;
+		regs->ip = pebs->real_ip;
+		regs->flags |= PERF_EFLAGS_EXACT;
+	} else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs))
+		regs->flags |= PERF_EFLAGS_EXACT;
 	else
-		regs.flags &= ~PERF_EFLAGS_EXACT;
+		regs->flags &= ~PERF_EFLAGS_EXACT;
 
 	if ((sample_type & PERF_SAMPLE_ADDR) &&
 	    x86_pmu.intel_cap.pebs_format >= 1)
-		data.addr = pebs->dla;
+		data->addr = pebs->dla;
 
 	if (x86_pmu.intel_cap.pebs_format >= 2) {
 		/* Only set the TSX weight when no memory weight. */
 		if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll)
-			data.weight = intel_hsw_weight(pebs);
+			data->weight = intel_hsw_weight(pebs);
 
 		if (sample_type & PERF_SAMPLE_TRANSACTION)
-			data.txn = intel_hsw_transaction(pebs);
+			data->txn = intel_hsw_transaction(pebs);
 	}
 
 	if (has_branch_stack(event))
-		data.br_stack = &cpuc->lbr_stack;
+		data->br_stack = &cpuc->lbr_stack;
+}
+
+static inline void *
+get_next_pebs_record_by_bit(void *base, void *top, int bit)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	void *at;
+	u64 pebs_status;
+
+	if (base == NULL)
+		return NULL;
+
+	for (at = base; at < top; at += x86_pmu.pebs_record_size) {
+		struct pebs_record_nhm *p = at;
 
-	if (perf_event_overflow(event, &data, &regs))
+		if (test_bit(bit, (unsigned long *)&p->status)) {
+			/* PEBS v3 has accurate status bits */
+			if (x86_pmu.intel_cap.pebs_format >= 3)
+				return at;
+
+			if (p->status == (1 << bit))
+				return at;
+
+			/* clear non-PEBS bit and re-check */
+			pebs_status = p->status & cpuc->pebs_enabled;
+			pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
+			if (pebs_status == (1 << bit))
+				return at;
+		}
+	}
+	return NULL;
+}
+
+static void __intel_pmu_pebs_event(struct perf_event *event,
+				   struct pt_regs *iregs,
+				   void *base, void *top,
+				   int bit, int count)
+{
+	struct perf_sample_data data;
+	struct pt_regs regs;
+	void *at = get_next_pebs_record_by_bit(base, top, bit);
+
+	if (!intel_pmu_save_and_restart(event) &&
+	    !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
+		return;
+
+	while (count > 1) {
+		setup_pebs_sample_data(event, iregs, at, &data, &regs);
+		perf_event_output(event, &data, &regs);
+		at += x86_pmu.pebs_record_size;
+		at = get_next_pebs_record_by_bit(at, top, bit);
+		count--;
+	}
+
+	setup_pebs_sample_data(event, iregs, at, &data, &regs);
+
+	/*
+	 * All but the last records are processed.
+	 * The last one is left to be able to call the overflow handler.
+	 */
+	if (perf_event_overflow(event, &data, &regs)) {
 		x86_pmu_stop(event, 0);
+		return;
+	}
+
 }
 
 static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
@@ -992,72 +1112,99 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
 	if (!event->attr.precise_ip)
 		return;
 
-	n = top - at;
+	n = (top - at) / x86_pmu.pebs_record_size;
 	if (n <= 0)
 		return;
 
-	/*
-	 * Should not happen, we program the threshold at 1 and do not
-	 * set a reset value.
-	 */
-	WARN_ONCE(n > 1, "bad leftover pebs %d\n", n);
-	at += n - 1;
-
-	__intel_pmu_pebs_event(event, iregs, at);
+	__intel_pmu_pebs_event(event, iregs, at, top, 0, n);
 }
 
 static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct debug_store *ds = cpuc->ds;
-	struct perf_event *event = NULL;
-	void *at, *top;
-	u64 status = 0;
-	int bit;
+	struct perf_event *event;
+	void *base, *at, *top;
+	short counts[MAX_PEBS_EVENTS] = {};
+	short error[MAX_PEBS_EVENTS] = {};
+	int bit, i;
 
 	if (!x86_pmu.pebs_active)
 		return;
 
-	at  = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+	base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
 	top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
 
 	ds->pebs_index = ds->pebs_buffer_base;
 
-	if (unlikely(at > top))
+	if (unlikely(base >= top))
 		return;
 
-	/*
-	 * Should not happen, we program the threshold at 1 and do not
-	 * set a reset value.
-	 */
-	WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size,
-		  "Unexpected number of pebs records %ld\n",
-		  (long)(top - at) / x86_pmu.pebs_record_size);
-
-	for (; at < top; at += x86_pmu.pebs_record_size) {
+	for (at = base; at < top; at += x86_pmu.pebs_record_size) {
 		struct pebs_record_nhm *p = at;
 
-		for_each_set_bit(bit, (unsigned long *)&p->status,
-				 x86_pmu.max_pebs_events) {
-			event = cpuc->events[bit];
-			if (!test_bit(bit, cpuc->active_mask))
-				continue;
-
-			WARN_ON_ONCE(!event);
+		/* PEBS v3 has accurate status bits */
+		if (x86_pmu.intel_cap.pebs_format >= 3) {
+			for_each_set_bit(bit, (unsigned long *)&p->status,
+					 MAX_PEBS_EVENTS)
+				counts[bit]++;
 
-			if (!event->attr.precise_ip)
-				continue;
+			continue;
+		}
 
-			if (__test_and_set_bit(bit, (unsigned long *)&status))
+		bit = find_first_bit((unsigned long *)&p->status,
+					x86_pmu.max_pebs_events);
+		if (bit >= x86_pmu.max_pebs_events)
+			continue;
+		if (!test_bit(bit, cpuc->active_mask))
+			continue;
+		/*
+		 * The PEBS hardware does not deal well with the situation
+		 * when events happen near to each other and multiple bits
+		 * are set. But it should happen rarely.
+		 *
+		 * If these events include one PEBS and multiple non-PEBS
+		 * events, it doesn't impact PEBS record. The record will
+		 * be handled normally. (slow path)
+		 *
+		 * If these events include two or more PEBS events, the
+		 * records for the events can be collapsed into a single
+		 * one, and it's not possible to reconstruct all events
+		 * that caused the PEBS record. It's called collision.
+		 * If collision happened, the record will be dropped.
+		 *
+		 */
+		if (p->status != (1 << bit)) {
+			u64 pebs_status;
+
+			/* slow path */
+			pebs_status = p->status & cpuc->pebs_enabled;
+			pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
+			if (pebs_status != (1 << bit)) {
+				for_each_set_bit(i, (unsigned long *)&pebs_status,
+						 MAX_PEBS_EVENTS)
+					error[i]++;
 				continue;
-
-			break;
+			}
 		}
+		counts[bit]++;
+	}
 
-		if (!event || bit >= x86_pmu.max_pebs_events)
+	for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
+		if ((counts[bit] == 0) && (error[bit] == 0))
 			continue;
+		event = cpuc->events[bit];
+		WARN_ON_ONCE(!event);
+		WARN_ON_ONCE(!event->attr.precise_ip);
 
-		__intel_pmu_pebs_event(event, iregs, at);
+		/* log dropped samples number */
+		if (error[bit])
+			perf_log_lost_samples(event, error[bit]);
+
+		if (counts[bit]) {
+			__intel_pmu_pebs_event(event, iregs, base,
+					       top, bit, counts[bit]);
+		}
 	}
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 94e5b506caa6..452a7bd2dedb 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -96,6 +96,7 @@ enum {
 	X86_BR_NO_TX		= 1 << 14,/* not in transaction */
 	X86_BR_ZERO_CALL	= 1 << 15,/* zero length call */
 	X86_BR_CALL_STACK	= 1 << 16,/* call stack */
+	X86_BR_IND_JMP		= 1 << 17,/* indirect jump */
 };
 
 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -113,6 +114,7 @@ enum {
 	 X86_BR_IRQ	 |\
 	 X86_BR_ABORT	 |\
 	 X86_BR_IND_CALL |\
+	 X86_BR_IND_JMP  |\
 	 X86_BR_ZERO_CALL)
 
 #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
@@ -262,9 +264,6 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct x86_perf_task_context *task_ctx;
 
-	if (!x86_pmu.lbr_nr)
-		return;
-
 	/*
 	 * If LBR callstack feature is enabled and the stack was saved when
 	 * the task was scheduled out, restore the stack. Otherwise flush
@@ -523,6 +522,9 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
 			X86_BR_CALL_STACK;
 	}
 
+	if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
+		mask |= X86_BR_IND_JMP;
+
 	/*
 	 * stash actual user request into reg, it may
 	 * be used by fixup code for some CPU
@@ -736,7 +738,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
 			break;
 		case 4:
 		case 5:
-			ret = X86_BR_JMP;
+			ret = X86_BR_IND_JMP;
 			break;
 		}
 		break;
@@ -844,6 +846,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
 	 */
 	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
 	[PERF_SAMPLE_BRANCH_COND_SHIFT]     = LBR_JCC,
+	[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
 };
 
 static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
@@ -856,6 +859,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
 						| LBR_FAR,
 	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]	= LBR_IND_CALL,
 	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
+	[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]	= LBR_IND_JMP,
 };
 
 static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
@@ -870,6 +874,7 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
 	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
 	[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
 						| LBR_RETURN | LBR_CALL_STACK,
+	[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]	= LBR_IND_JMP,
 };
 
 /* core */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index ffe666c2c6b5..159887c3a89d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -151,7 +151,7 @@ static int __init pt_pmu_hw_init(void)
 
 		de_attr->attr.attr.name = pt_caps[i].name;
 
-		sysfs_attr_init(&de_attrs->attr.attr);
+		sysfs_attr_init(&de_attr->attr.attr);
 
 		de_attr->attr.attr.mode		= S_IRUGO;
 		de_attr->attr.show		= pt_cap_show;
@@ -187,15 +187,6 @@ static bool pt_event_valid(struct perf_event *event)
  * These all are cpu affine and operate on a local PT
  */
 
-static bool pt_is_running(void)
-{
-	u64 ctl;
-
-	rdmsrl(MSR_IA32_RTIT_CTL, ctl);
-
-	return !!(ctl & RTIT_CTL_TRACEEN);
-}
-
 static void pt_config(struct perf_event *event)
 {
 	u64 reg;
@@ -609,16 +600,19 @@ static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
  * @handle:	Current output handle.
  *
  * Place INT and STOP marks to prevent overwriting old data that the consumer
- * hasn't yet collected.
+ * hasn't yet collected and waking up the consumer after a certain fraction of
+ * the buffer has filled up. Only needed and sensible for non-snapshot counters.
+ *
+ * This obviously relies on buf::head to figure out buffer markers, so it has
+ * to be called after pt_buffer_reset_offsets() and before the hardware tracing
+ * is enabled.
  */
 static int pt_buffer_reset_markers(struct pt_buffer *buf,
 				   struct perf_output_handle *handle)
 
 {
-	unsigned long idx, npages, end;
-
-	if (buf->snapshot)
-		return 0;
+	unsigned long head = local64_read(&buf->head);
+	unsigned long idx, npages, wakeup;
 
 	/* can't stop in the middle of an output region */
 	if (buf->output_off + handle->size + 1 <
@@ -634,17 +628,26 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf,
 	buf->topa_index[buf->stop_pos]->stop = 0;
 	buf->topa_index[buf->intr_pos]->intr = 0;
 
-	if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
-		npages = (handle->size + 1) >> PAGE_SHIFT;
-		end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages;
-		/*if (end > handle->wakeup >> PAGE_SHIFT)
-		  end = handle->wakeup >> PAGE_SHIFT;*/
-		idx = end & (buf->nr_pages - 1);
-		buf->stop_pos = idx;
-		idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1;
-		idx &= buf->nr_pages - 1;
-		buf->intr_pos = idx;
-	}
+	/* how many pages till the STOP marker */
+	npages = handle->size >> PAGE_SHIFT;
+
+	/* if it's on a page boundary, fill up one more page */
+	if (!offset_in_page(head + handle->size + 1))
+		npages++;
+
+	idx = (head >> PAGE_SHIFT) + npages;
+	idx &= buf->nr_pages - 1;
+	buf->stop_pos = idx;
+
+	wakeup = handle->wakeup >> PAGE_SHIFT;
+
+	/* in the worst case, wake up the consumer one page before hard stop */
+	idx = (head >> PAGE_SHIFT) + npages - 1;
+	if (idx > wakeup)
+		idx = wakeup;
+
+	idx &= buf->nr_pages - 1;
+	buf->intr_pos = idx;
 
 	buf->topa_index[buf->stop_pos]->stop = 1;
 	buf->topa_index[buf->intr_pos]->intr = 1;
@@ -664,7 +667,7 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
 	struct topa *cur = buf->first, *prev = buf->last;
 	struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
 		*te_prev = TOPA_ENTRY(prev, prev->last - 1);
-	int pg = 0, idx = 0, ntopa = 0;
+	int pg = 0, idx = 0;
 
 	while (pg < buf->nr_pages) {
 		int tidx;
@@ -679,9 +682,9 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
 			/* advance to next topa table */
 			idx = 0;
 			cur = list_entry(cur->list.next, struct topa, list);
-			ntopa++;
-		} else
+		} else {
 			idx++;
+		}
 		te_cur = TOPA_ENTRY(cur, idx);
 	}
 
@@ -693,7 +696,14 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
  * @head:	Write pointer (aux_head) from AUX buffer.
  *
  * Find the ToPA table and entry corresponding to given @head and set buffer's
- * "current" pointers accordingly.
+ * "current" pointers accordingly. This is done after we have obtained the
+ * current aux_head position from a successful call to perf_aux_output_begin()
+ * to make sure the hardware is writing to the right place.
+ *
+ * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
+ * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
+ * which are used to determine INT and STOP markers' locations by a subsequent
+ * call to pt_buffer_reset_markers().
  */
 static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
 {
@@ -891,6 +901,7 @@ void intel_pt_interrupt(void)
 		}
 
 		pt_buffer_reset_offsets(buf, pt->handle.head);
+		/* snapshot counters don't use PMI, so it's safe */
 		ret = pt_buffer_reset_markers(buf, &pt->handle);
 		if (ret) {
 			perf_aux_output_end(&pt->handle, 0, true);
@@ -913,7 +924,7 @@ static void pt_event_start(struct perf_event *event, int mode)
 	struct pt *pt = this_cpu_ptr(&pt_ctx);
 	struct pt_buffer *buf = perf_get_aux(&pt->handle);
 
-	if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
+	if (!buf || pt_buffer_is_full(buf, pt)) {
 		event->hw.state = PERF_HES_STOPPED;
 		return;
 	}
@@ -944,7 +955,6 @@ static void pt_event_stop(struct perf_event *event, int mode)
 	event->hw.state = PERF_HES_STOPPED;
 
 	if (mode & PERF_EF_UPDATE) {
-		struct pt *pt = this_cpu_ptr(&pt_ctx);
 		struct pt_buffer *buf = perf_get_aux(&pt->handle);
 
 		if (!buf)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index 358c54ad20d4..5cbd4e64feb5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -204,9 +204,8 @@ again:
 
 static void rapl_start_hrtimer(struct rapl_pmu *pmu)
 {
-	__hrtimer_start_range_ns(&pmu->hrtimer,
-			pmu->timer_interval, 0,
-			HRTIMER_MODE_REL_PINNED, 0);
+       hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
+		     HRTIMER_MODE_REL_PINNED);
 }
 
 static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index c635b8b49e93..21b5e38c921b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -233,9 +233,8 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
 
 void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
 {
-	__hrtimer_start_range_ns(&box->hrtimer,
-			ns_to_ktime(box->hrtimer_duration), 0,
-			HRTIMER_MODE_REL_PINNED, 0);
+	hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration),
+		      HRTIMER_MODE_REL_PINNED);
 }
 
 void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
@@ -365,9 +364,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
 	bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
 
 	for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
-		hwc = &box->event_list[i]->hw;
 		c = uncore_get_event_constraint(box, box->event_list[i]);
-		hwc->constraint = c;
+		box->event_constraint[i] = c;
 		wmin = min(wmin, c->weight);
 		wmax = max(wmax, c->weight);
 	}
@@ -375,7 +373,7 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
 	/* fastpath, try to reuse previous register */
 	for (i = 0; i < n; i++) {
 		hwc = &box->event_list[i]->hw;
-		c = hwc->constraint;
+		c = box->event_constraint[i];
 
 		/* never assigned */
 		if (hwc->idx == -1)
@@ -395,8 +393,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
 	}
 	/* slow path */
 	if (i != n)
-		ret = perf_assign_events(box->event_list, n,
-					 wmin, wmax, assign);
+		ret = perf_assign_events(box->event_constraint, n,
+					 wmin, wmax, n, assign);
 
 	if (!assign || ret) {
 		for (i = 0; i < n; i++)
@@ -840,6 +838,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id
 	box->phys_id = phys_id;
 	box->pci_dev = pdev;
 	box->pmu = pmu;
+	uncore_box_init(box);
 	pci_set_drvdata(pdev, box);
 
 	raw_spin_lock(&uncore_box_lock);
@@ -922,6 +921,9 @@ static int __init uncore_pci_init(void)
 	case 69: /* Haswell Celeron */
 		ret = hsw_uncore_pci_init();
 		break;
+	case 61: /* Broadwell */
+		ret = bdw_uncore_pci_init();
+		break;
 	default:
 		return 0;
 	}
@@ -1003,8 +1005,10 @@ static int uncore_cpu_starting(int cpu)
 			pmu = &type->pmus[j];
 			box = *per_cpu_ptr(pmu->box, cpu);
 			/* called by uncore_cpu_init? */
-			if (box && box->phys_id >= 0)
+			if (box && box->phys_id >= 0) {
+				uncore_box_init(box);
 				continue;
+			}
 
 			for_each_online_cpu(k) {
 				exist = *per_cpu_ptr(pmu->box, k);
@@ -1020,8 +1024,10 @@ static int uncore_cpu_starting(int cpu)
 				}
 			}
 
-			if (box)
+			if (box) {
 				box->phys_id = phys_id;
+				uncore_box_init(box);
+			}
 		}
 	}
 	return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 6c8c1e7e69d8..0f77f0a196e4 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -97,6 +97,7 @@ struct intel_uncore_box {
 	atomic_t refcnt;
 	struct perf_event *events[UNCORE_PMC_IDX_MAX];
 	struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
+	struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX];
 	unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
 	u64 tags[UNCORE_PMC_IDX_MAX];
 	struct pci_dev *pci_dev;
@@ -257,14 +258,6 @@ static inline int uncore_num_counters(struct intel_uncore_box *box)
 	return box->pmu->type->num_counters;
 }
 
-static inline void uncore_box_init(struct intel_uncore_box *box)
-{
-	if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
-		if (box->pmu->type->ops->init_box)
-			box->pmu->type->ops->init_box(box);
-	}
-}
-
 static inline void uncore_disable_box(struct intel_uncore_box *box)
 {
 	if (box->pmu->type->ops->disable_box)
@@ -273,8 +266,6 @@ static inline void uncore_disable_box(struct intel_uncore_box *box)
 
 static inline void uncore_enable_box(struct intel_uncore_box *box)
 {
-	uncore_box_init(box);
-
 	if (box->pmu->type->ops->enable_box)
 		box->pmu->type->ops->enable_box(box);
 }
@@ -297,6 +288,14 @@ static inline u64 uncore_read_counter(struct intel_uncore_box *box,
 	return box->pmu->type->ops->read_counter(box, event);
 }
 
+static inline void uncore_box_init(struct intel_uncore_box *box)
+{
+	if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
+		if (box->pmu->type->ops->init_box)
+			box->pmu->type->ops->init_box(box);
+	}
+}
+
 static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
 {
 	return (box->phys_id < 0);
@@ -326,6 +325,7 @@ extern struct event_constraint uncore_constraint_empty;
 int snb_uncore_pci_init(void);
 int ivb_uncore_pci_init(void);
 int hsw_uncore_pci_init(void);
+int bdw_uncore_pci_init(void);
 void snb_uncore_cpu_init(void);
 void nhm_uncore_cpu_init(void);
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 4562e9e22c60..b005a78c7012 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -7,6 +7,7 @@
 #define PCI_DEVICE_ID_INTEL_IVB_E3_IMC	0x0150
 #define PCI_DEVICE_ID_INTEL_HSW_IMC	0x0c00
 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC	0x0a04
+#define PCI_DEVICE_ID_INTEL_BDW_IMC	0x1604
 
 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK			0x000000ff
@@ -486,6 +487,14 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = {
 	{ /* end: all zeroes */ },
 };
 
+static const struct pci_device_id bdw_uncore_pci_ids[] = {
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* end: all zeroes */ },
+};
+
 static struct pci_driver snb_uncore_pci_driver = {
 	.name		= "snb_uncore",
 	.id_table	= snb_uncore_pci_ids,
@@ -501,6 +510,11 @@ static struct pci_driver hsw_uncore_pci_driver = {
 	.id_table	= hsw_uncore_pci_ids,
 };
 
+static struct pci_driver bdw_uncore_pci_driver = {
+	.name		= "bdw_uncore",
+	.id_table	= bdw_uncore_pci_ids,
+};
+
 struct imc_uncore_pci_dev {
 	__u32 pci_id;
 	struct pci_driver *driver;
@@ -514,6 +528,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
 	IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
 	IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver),    /* 4th Gen Core Processor */
 	IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver),  /* 4th Gen Core ULT Mobile Processor */
+	IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver),    /* 5th Gen Core U */
 	{  /* end marker */ }
 };
 
@@ -561,6 +576,11 @@ int hsw_uncore_pci_init(void)
 	return imc_uncore_pci_init();
 }
 
+int bdw_uncore_pci_init(void)
+{
+	return imc_uncore_pci_init();
+}
+
 /* end of Sandy Bridge uncore support */
 
 /* Nehalem uncore support */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 12d9548457e7..6d6e85dd5849 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -164,8 +164,8 @@
 				((1ULL << (n)) - 1)))
 
 /* Haswell-EP Ubox */
-#define HSWEP_U_MSR_PMON_CTR0			0x705
-#define HSWEP_U_MSR_PMON_CTL0			0x709
+#define HSWEP_U_MSR_PMON_CTR0			0x709
+#define HSWEP_U_MSR_PMON_CTL0			0x705
 #define HSWEP_U_MSR_PMON_FILTER			0x707
 
 #define HSWEP_U_MSR_PMON_UCLK_FIXED_CTL		0x703
@@ -1914,7 +1914,7 @@ static struct intel_uncore_type hswep_uncore_cbox = {
 	.name			= "cbox",
 	.num_counters		= 4,
 	.num_boxes		= 18,
-	.perf_ctr_bits		= 44,
+	.perf_ctr_bits		= 48,
 	.event_ctl		= HSWEP_C0_MSR_PMON_CTL0,
 	.perf_ctr		= HSWEP_C0_MSR_PMON_CTR0,
 	.event_mask		= SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index e7d8c7608471..18ca99f2798b 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -12,7 +12,8 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
 {
 #ifdef CONFIG_SMP
 	seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
-	seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu)));
+	seq_printf(m, "siblings\t: %d\n",
+		   cpumask_weight(topology_core_cpumask(cpu)));
 	seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
 	seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
 	seq_printf(m, "apicid\t\t: %d\n", c->apicid);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c76d3e37c6e1..e068d6683dba 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -22,6 +22,7 @@
 #include <linux/elfcore.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 
 #include <asm/processor.h>
 #include <asm/hardirq.h>
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 6367a780cc8c..5ee771859b6f 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -4,7 +4,6 @@
 #include <linux/bootmem.h>
 #include <linux/export.h>
 #include <linux/io.h>
-#include <linux/irqdomain.h>
 #include <linux/interrupt.h>
 #include <linux/list.h>
 #include <linux/of.h>
@@ -17,6 +16,7 @@
 #include <linux/of_pci.h>
 #include <linux/initrd.h>
 
+#include <asm/irqdomain.h>
 #include <asm/hpet.h>
 #include <asm/apic.h>
 #include <asm/pci_x86.h>
@@ -196,38 +196,31 @@ static struct of_ioapic_type of_ioapic_type[] =
 	},
 };
 
-static int ioapic_xlate(struct irq_domain *domain,
-			struct device_node *controller,
-			const u32 *intspec, u32 intsize,
-			irq_hw_number_t *out_hwirq, u32 *out_type)
+static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs, void *arg)
 {
+	struct of_phandle_args *irq_data = (void *)arg;
 	struct of_ioapic_type *it;
-	u32 line, idx, gsi;
+	struct irq_alloc_info tmp;
 
-	if (WARN_ON(intsize < 2))
+	if (WARN_ON(irq_data->args_count < 2))
 		return -EINVAL;
-
-	line = intspec[0];
-
-	if (intspec[1] >= ARRAY_SIZE(of_ioapic_type))
+	if (irq_data->args[1] >= ARRAY_SIZE(of_ioapic_type))
 		return -EINVAL;
 
-	it = &of_ioapic_type[intspec[1]];
+	it = &of_ioapic_type[irq_data->args[1]];
+	ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity);
+	tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
+	tmp.ioapic_pin = irq_data->args[0];
 
-	idx = (u32)(long)domain->host_data;
-	gsi = mp_pin_to_gsi(idx, line);
-	if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0)))
-		return -EBUSY;
-
-	*out_hwirq = line;
-	*out_type = it->out_type;
-	return 0;
+	return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp);
 }
 
-const struct irq_domain_ops ioapic_irq_domain_ops = {
-	.map = mp_irqdomain_map,
-	.unmap = mp_irqdomain_unmap,
-	.xlate = ioapic_xlate,
+static const struct irq_domain_ops ioapic_irq_domain_ops = {
+	.alloc		= dt_irqdomain_alloc,
+	.free		= mp_irqdomain_free,
+	.activate	= mp_irqdomain_activate,
+	.deactivate	= mp_irqdomain_deactivate,
 };
 
 static void __init dtb_add_ioapic(struct device_node *dn)
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index fe9f0b79a18b..5cb9a4d6f623 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -627,8 +627,12 @@ static struct chipset early_qrk[] __initdata = {
 	{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
 	  QFLAG_APPLY_ONCE, intel_graphics_stolen },
 	/*
-	 * HPET on current version of Baytrail platform has accuracy
-	 * problems, disable it for now:
+	 * HPET on the current version of the Baytrail platform has accuracy
+	 * problems: it will halt in deep idle state - so we disable it.
+	 *
+	 * More details can be found in section 18.10.1.3 of the datasheet:
+	 *
+	 *    http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf
 	 */
 	{ PCI_VENDOR_ID_INTEL, 0x0f00,
 		PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
deleted file mode 100644
index 1c309763e321..000000000000
--- a/arch/x86/kernel/entry_32.S
+++ /dev/null
@@ -1,1401 +0,0 @@
-/*
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- */
-
-/*
- * entry.S contains the system-call and fault low-level handling routines.
- * This also contains the timer-interrupt handler, as well as all interrupts
- * and faults that can result in a task-switch.
- *
- * NOTE: This code handles signal-recognition, which happens every time
- * after a timer-interrupt and after each system call.
- *
- * I changed all the .align's to 4 (16 byte alignment), as that's faster
- * on a 486.
- *
- * Stack layout in 'syscall_exit':
- * 	ptrace needs to have all regs on the stack.
- *	if the order here is changed, it needs to be
- *	updated in fork.c:copy_process, signal.c:do_signal,
- *	ptrace.c and ptrace.h
- *
- *	 0(%esp) - %ebx
- *	 4(%esp) - %ecx
- *	 8(%esp) - %edx
- *       C(%esp) - %esi
- *	10(%esp) - %edi
- *	14(%esp) - %ebp
- *	18(%esp) - %eax
- *	1C(%esp) - %ds
- *	20(%esp) - %es
- *	24(%esp) - %fs
- *	28(%esp) - %gs		saved iff !CONFIG_X86_32_LAZY_GS
- *	2C(%esp) - orig_eax
- *	30(%esp) - %eip
- *	34(%esp) - %cs
- *	38(%esp) - %eflags
- *	3C(%esp) - %oldesp
- *	40(%esp) - %oldss
- *
- * "current" is in register %ebx during any slow entries.
- */
-
-#include <linux/linkage.h>
-#include <linux/err.h>
-#include <asm/thread_info.h>
-#include <asm/irqflags.h>
-#include <asm/errno.h>
-#include <asm/segment.h>
-#include <asm/smp.h>
-#include <asm/page_types.h>
-#include <asm/percpu.h>
-#include <asm/dwarf2.h>
-#include <asm/processor-flags.h>
-#include <asm/ftrace.h>
-#include <asm/irq_vectors.h>
-#include <asm/cpufeature.h>
-#include <asm/alternative-asm.h>
-#include <asm/asm.h>
-#include <asm/smap.h>
-
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE	   0x40000000
-
-#ifndef CONFIG_AUDITSYSCALL
-#define sysenter_audit	syscall_trace_entry
-#define sysexit_audit	syscall_exit_work
-#endif
-
-	.section .entry.text, "ax"
-
-/*
- * We use macros for low-level operations which need to be overridden
- * for paravirtualization.  The following will never clobber any registers:
- *   INTERRUPT_RETURN (aka. "iret")
- *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- *   ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
- *
- * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
- * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
- * Allowing a register to be clobbered can shrink the paravirt replacement
- * enough to patch inline, increasing performance.
- */
-
-#ifdef CONFIG_PREEMPT
-#define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
-#else
-#define preempt_stop(clobbers)
-#define resume_kernel		restore_all
-#endif
-
-.macro TRACE_IRQS_IRET
-#ifdef CONFIG_TRACE_IRQFLAGS
-	testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)     # interrupts off?
-	jz 1f
-	TRACE_IRQS_ON
-1:
-#endif
-.endm
-
-/*
- * User gs save/restore
- *
- * %gs is used for userland TLS and kernel only uses it for stack
- * canary which is required to be at %gs:20 by gcc.  Read the comment
- * at the top of stackprotector.h for more info.
- *
- * Local labels 98 and 99 are used.
- */
-#ifdef CONFIG_X86_32_LAZY_GS
-
- /* unfortunately push/pop can't be no-op */
-.macro PUSH_GS
-	pushl_cfi $0
-.endm
-.macro POP_GS pop=0
-	addl $(4 + \pop), %esp
-	CFI_ADJUST_CFA_OFFSET -(4 + \pop)
-.endm
-.macro POP_GS_EX
-.endm
-
- /* all the rest are no-op */
-.macro PTGS_TO_GS
-.endm
-.macro PTGS_TO_GS_EX
-.endm
-.macro GS_TO_REG reg
-.endm
-.macro REG_TO_PTGS reg
-.endm
-.macro SET_KERNEL_GS reg
-.endm
-
-#else	/* CONFIG_X86_32_LAZY_GS */
-
-.macro PUSH_GS
-	pushl_cfi %gs
-	/*CFI_REL_OFFSET gs, 0*/
-.endm
-
-.macro POP_GS pop=0
-98:	popl_cfi %gs
-	/*CFI_RESTORE gs*/
-  .if \pop <> 0
-	add $\pop, %esp
-	CFI_ADJUST_CFA_OFFSET -\pop
-  .endif
-.endm
-.macro POP_GS_EX
-.pushsection .fixup, "ax"
-99:	movl $0, (%esp)
-	jmp 98b
-.popsection
-	_ASM_EXTABLE(98b,99b)
-.endm
-
-.macro PTGS_TO_GS
-98:	mov PT_GS(%esp), %gs
-.endm
-.macro PTGS_TO_GS_EX
-.pushsection .fixup, "ax"
-99:	movl $0, PT_GS(%esp)
-	jmp 98b
-.popsection
-	_ASM_EXTABLE(98b,99b)
-.endm
-
-.macro GS_TO_REG reg
-	movl %gs, \reg
-	/*CFI_REGISTER gs, \reg*/
-.endm
-.macro REG_TO_PTGS reg
-	movl \reg, PT_GS(%esp)
-	/*CFI_REL_OFFSET gs, PT_GS*/
-.endm
-.macro SET_KERNEL_GS reg
-	movl $(__KERNEL_STACK_CANARY), \reg
-	movl \reg, %gs
-.endm
-
-#endif	/* CONFIG_X86_32_LAZY_GS */
-
-.macro SAVE_ALL
-	cld
-	PUSH_GS
-	pushl_cfi %fs
-	/*CFI_REL_OFFSET fs, 0;*/
-	pushl_cfi %es
-	/*CFI_REL_OFFSET es, 0;*/
-	pushl_cfi %ds
-	/*CFI_REL_OFFSET ds, 0;*/
-	pushl_cfi %eax
-	CFI_REL_OFFSET eax, 0
-	pushl_cfi %ebp
-	CFI_REL_OFFSET ebp, 0
-	pushl_cfi %edi
-	CFI_REL_OFFSET edi, 0
-	pushl_cfi %esi
-	CFI_REL_OFFSET esi, 0
-	pushl_cfi %edx
-	CFI_REL_OFFSET edx, 0
-	pushl_cfi %ecx
-	CFI_REL_OFFSET ecx, 0
-	pushl_cfi %ebx
-	CFI_REL_OFFSET ebx, 0
-	movl $(__USER_DS), %edx
-	movl %edx, %ds
-	movl %edx, %es
-	movl $(__KERNEL_PERCPU), %edx
-	movl %edx, %fs
-	SET_KERNEL_GS %edx
-.endm
-
-.macro RESTORE_INT_REGS
-	popl_cfi %ebx
-	CFI_RESTORE ebx
-	popl_cfi %ecx
-	CFI_RESTORE ecx
-	popl_cfi %edx
-	CFI_RESTORE edx
-	popl_cfi %esi
-	CFI_RESTORE esi
-	popl_cfi %edi
-	CFI_RESTORE edi
-	popl_cfi %ebp
-	CFI_RESTORE ebp
-	popl_cfi %eax
-	CFI_RESTORE eax
-.endm
-
-.macro RESTORE_REGS pop=0
-	RESTORE_INT_REGS
-1:	popl_cfi %ds
-	/*CFI_RESTORE ds;*/
-2:	popl_cfi %es
-	/*CFI_RESTORE es;*/
-3:	popl_cfi %fs
-	/*CFI_RESTORE fs;*/
-	POP_GS \pop
-.pushsection .fixup, "ax"
-4:	movl $0, (%esp)
-	jmp 1b
-5:	movl $0, (%esp)
-	jmp 2b
-6:	movl $0, (%esp)
-	jmp 3b
-.popsection
-	_ASM_EXTABLE(1b,4b)
-	_ASM_EXTABLE(2b,5b)
-	_ASM_EXTABLE(3b,6b)
-	POP_GS_EX
-.endm
-
-.macro RING0_INT_FRAME
-	CFI_STARTPROC simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA esp, 3*4
-	/*CFI_OFFSET cs, -2*4;*/
-	CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_EC_FRAME
-	CFI_STARTPROC simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA esp, 4*4
-	/*CFI_OFFSET cs, -2*4;*/
-	CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_PTREGS_FRAME
-	CFI_STARTPROC simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
-	/*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
-	CFI_OFFSET eip, PT_EIP-PT_OLDESP
-	/*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
-	/*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
-	CFI_OFFSET eax, PT_EAX-PT_OLDESP
-	CFI_OFFSET ebp, PT_EBP-PT_OLDESP
-	CFI_OFFSET edi, PT_EDI-PT_OLDESP
-	CFI_OFFSET esi, PT_ESI-PT_OLDESP
-	CFI_OFFSET edx, PT_EDX-PT_OLDESP
-	CFI_OFFSET ecx, PT_ECX-PT_OLDESP
-	CFI_OFFSET ebx, PT_EBX-PT_OLDESP
-.endm
-
-ENTRY(ret_from_fork)
-	CFI_STARTPROC
-	pushl_cfi %eax
-	call schedule_tail
-	GET_THREAD_INFO(%ebp)
-	popl_cfi %eax
-	pushl_cfi $0x0202		# Reset kernel eflags
-	popfl_cfi
-	jmp syscall_exit
-	CFI_ENDPROC
-END(ret_from_fork)
-
-ENTRY(ret_from_kernel_thread)
-	CFI_STARTPROC
-	pushl_cfi %eax
-	call schedule_tail
-	GET_THREAD_INFO(%ebp)
-	popl_cfi %eax
-	pushl_cfi $0x0202		# Reset kernel eflags
-	popfl_cfi
-	movl PT_EBP(%esp),%eax
-	call *PT_EBX(%esp)
-	movl $0,PT_EAX(%esp)
-	jmp syscall_exit
-	CFI_ENDPROC
-ENDPROC(ret_from_kernel_thread)
-
-/*
- * Return to user mode is not as complex as all this looks,
- * but we want the default path for a system call return to
- * go as quickly as possible which is why some of this is
- * less clear than it otherwise should be.
- */
-
-	# userspace resumption stub bypassing syscall exit tracing
-	ALIGN
-	RING0_PTREGS_FRAME
-ret_from_exception:
-	preempt_stop(CLBR_ANY)
-ret_from_intr:
-	GET_THREAD_INFO(%ebp)
-#ifdef CONFIG_VM86
-	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS and CS
-	movb PT_CS(%esp), %al
-	andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
-#else
-	/*
-	 * We can be coming here from child spawned by kernel_thread().
-	 */
-	movl PT_CS(%esp), %eax
-	andl $SEGMENT_RPL_MASK, %eax
-#endif
-	cmpl $USER_RPL, %eax
-	jb resume_kernel		# not returning to v8086 or userspace
-
-ENTRY(resume_userspace)
-	LOCKDEP_SYS_EXIT
- 	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
-					# setting need_resched or sigpending
-					# between sampling and the iret
-	TRACE_IRQS_OFF
-	movl TI_flags(%ebp), %ecx
-	andl $_TIF_WORK_MASK, %ecx	# is there any work to be done on
-					# int/exception return?
-	jne work_pending
-	jmp restore_all
-END(ret_from_exception)
-
-#ifdef CONFIG_PREEMPT
-ENTRY(resume_kernel)
-	DISABLE_INTERRUPTS(CLBR_ANY)
-need_resched:
-	cmpl $0,PER_CPU_VAR(__preempt_count)
-	jnz restore_all
-	testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)	# interrupts off (exception path) ?
-	jz restore_all
-	call preempt_schedule_irq
-	jmp need_resched
-END(resume_kernel)
-#endif
-	CFI_ENDPROC
-
-/* SYSENTER_RETURN points to after the "sysenter" instruction in
-   the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
-
-	# sysenter call handler stub
-ENTRY(ia32_sysenter_target)
-	CFI_STARTPROC simple
-	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA esp, 0
-	CFI_REGISTER esp, ebp
-	movl TSS_sysenter_sp0(%esp),%esp
-sysenter_past_esp:
-	/*
-	 * Interrupts are disabled here, but we can't trace it until
-	 * enough kernel state to call TRACE_IRQS_OFF can be called - but
-	 * we immediately enable interrupts at that point anyway.
-	 */
-	pushl_cfi $__USER_DS
-	/*CFI_REL_OFFSET ss, 0*/
-	pushl_cfi %ebp
-	CFI_REL_OFFSET esp, 0
-	pushfl_cfi
-	orl $X86_EFLAGS_IF, (%esp)
-	pushl_cfi $__USER_CS
-	/*CFI_REL_OFFSET cs, 0*/
-	/*
-	 * Push current_thread_info()->sysenter_return to the stack.
-	 * A tiny bit of offset fixup is necessary: TI_sysenter_return
-	 * is relative to thread_info, which is at the bottom of the
-	 * kernel stack page.  4*4 means the 4 words pushed above;
-	 * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
-	 * and THREAD_SIZE takes us to the bottom.
-	 */
-	pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
-	CFI_REL_OFFSET eip, 0
-
-	pushl_cfi %eax
-	SAVE_ALL
-	ENABLE_INTERRUPTS(CLBR_NONE)
-
-/*
- * Load the potential sixth argument from user stack.
- * Careful about security.
- */
-	cmpl $__PAGE_OFFSET-3,%ebp
-	jae syscall_fault
-	ASM_STAC
-1:	movl (%ebp),%ebp
-	ASM_CLAC
-	movl %ebp,PT_EBP(%esp)
-	_ASM_EXTABLE(1b,syscall_fault)
-
-	GET_THREAD_INFO(%ebp)
-
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
-	jnz sysenter_audit
-sysenter_do_call:
-	cmpl $(NR_syscalls), %eax
-	jae sysenter_badsys
-	call *sys_call_table(,%eax,4)
-sysenter_after_call:
-	movl %eax,PT_EAX(%esp)
-	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_OFF
-	movl TI_flags(%ebp), %ecx
-	testl $_TIF_ALLWORK_MASK, %ecx
-	jnz sysexit_audit
-sysenter_exit:
-/* if something modifies registers it must also disable sysexit */
-	movl PT_EIP(%esp), %edx
-	movl PT_OLDESP(%esp), %ecx
-	xorl %ebp,%ebp
-	TRACE_IRQS_ON
-1:	mov  PT_FS(%esp), %fs
-	PTGS_TO_GS
-	ENABLE_INTERRUPTS_SYSEXIT
-
-#ifdef CONFIG_AUDITSYSCALL
-sysenter_audit:
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
-	jnz syscall_trace_entry
-	/* movl PT_EAX(%esp), %eax	already set, syscall number: 1st arg to audit */
-	movl PT_EBX(%esp), %edx		/* ebx/a0: 2nd arg to audit */
-	/* movl PT_ECX(%esp), %ecx	already set, a1: 3nd arg to audit */
-	pushl_cfi PT_ESI(%esp)		/* a3: 5th arg */
-	pushl_cfi PT_EDX+4(%esp)	/* a2: 4th arg */
-	call __audit_syscall_entry
-	popl_cfi %ecx /* get that remapped edx off the stack */
-	popl_cfi %ecx /* get that remapped esi off the stack */
-	movl PT_EAX(%esp),%eax		/* reload syscall number */
-	jmp sysenter_do_call
-
-sysexit_audit:
-	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
-	jnz syscall_exit_work
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)
-	movl %eax,%edx		/* second arg, syscall return value */
-	cmpl $-MAX_ERRNO,%eax	/* is it an error ? */
-	setbe %al		/* 1 if so, 0 if not */
-	movzbl %al,%eax		/* zero-extend that */
-	call __audit_syscall_exit
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_OFF
-	movl TI_flags(%ebp), %ecx
-	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
-	jnz syscall_exit_work
-	movl PT_EAX(%esp),%eax	/* reload syscall return value */
-	jmp sysenter_exit
-#endif
-
-	CFI_ENDPROC
-.pushsection .fixup,"ax"
-2:	movl $0,PT_FS(%esp)
-	jmp 1b
-.popsection
-	_ASM_EXTABLE(1b,2b)
-	PTGS_TO_GS_EX
-ENDPROC(ia32_sysenter_target)
-
-	# system call handler stub
-ENTRY(system_call)
-	RING0_INT_FRAME			# can't unwind into user space anyway
-	ASM_CLAC
-	pushl_cfi %eax			# save orig_eax
-	SAVE_ALL
-	GET_THREAD_INFO(%ebp)
-					# system call tracing in operation / emulation
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
-	jnz syscall_trace_entry
-	cmpl $(NR_syscalls), %eax
-	jae syscall_badsys
-syscall_call:
-	call *sys_call_table(,%eax,4)
-syscall_after_call:
-	movl %eax,PT_EAX(%esp)		# store the return value
-syscall_exit:
-	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
-					# setting need_resched or sigpending
-					# between sampling and the iret
-	TRACE_IRQS_OFF
-	movl TI_flags(%ebp), %ecx
-	testl $_TIF_ALLWORK_MASK, %ecx	# current->work
-	jnz syscall_exit_work
-
-restore_all:
-	TRACE_IRQS_IRET
-restore_all_notrace:
-#ifdef CONFIG_X86_ESPFIX32
-	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS, SS and CS
-	# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
-	# are returning to the kernel.
-	# See comments in process.c:copy_thread() for details.
-	movb PT_OLDSS(%esp), %ah
-	movb PT_CS(%esp), %al
-	andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
-	cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
-	CFI_REMEMBER_STATE
-	je ldt_ss			# returning to user-space with LDT SS
-#endif
-restore_nocheck:
-	RESTORE_REGS 4			# skip orig_eax/error_code
-irq_return:
-	INTERRUPT_RETURN
-.section .fixup,"ax"
-ENTRY(iret_exc)
-	pushl $0			# no error code
-	pushl $do_iret_error
-	jmp error_code
-.previous
-	_ASM_EXTABLE(irq_return,iret_exc)
-
-#ifdef CONFIG_X86_ESPFIX32
-	CFI_RESTORE_STATE
-ldt_ss:
-#ifdef CONFIG_PARAVIRT
-	/*
-	 * The kernel can't run on a non-flat stack if paravirt mode
-	 * is active.  Rather than try to fixup the high bits of
-	 * ESP, bypass this code entirely.  This may break DOSemu
-	 * and/or Wine support in a paravirt VM, although the option
-	 * is still available to implement the setting of the high
-	 * 16-bits in the INTERRUPT_RETURN paravirt-op.
-	 */
-	cmpl $0, pv_info+PARAVIRT_enabled
-	jne restore_nocheck
-#endif
-
-/*
- * Setup and switch to ESPFIX stack
- *
- * We're returning to userspace with a 16 bit stack. The CPU will not
- * restore the high word of ESP for us on executing iret... This is an
- * "official" bug of all the x86-compatible CPUs, which we can work
- * around to make dosemu and wine happy. We do this by preloading the
- * high word of ESP with the high word of the userspace ESP while
- * compensating for the offset by changing to the ESPFIX segment with
- * a base address that matches for the difference.
- */
-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
-	mov %esp, %edx			/* load kernel esp */
-	mov PT_OLDESP(%esp), %eax	/* load userspace esp */
-	mov %dx, %ax			/* eax: new kernel esp */
-	sub %eax, %edx			/* offset (low word is 0) */
-	shr $16, %edx
-	mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
-	mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
-	pushl_cfi $__ESPFIX_SS
-	pushl_cfi %eax			/* new kernel esp */
-	/* Disable interrupts, but do not irqtrace this section: we
-	 * will soon execute iret and the tracer was already set to
-	 * the irqstate after the iret */
-	DISABLE_INTERRUPTS(CLBR_EAX)
-	lss (%esp), %esp		/* switch to espfix segment */
-	CFI_ADJUST_CFA_OFFSET -8
-	jmp restore_nocheck
-#endif
-	CFI_ENDPROC
-ENDPROC(system_call)
-
-	# perform work that needs to be done immediately before resumption
-	ALIGN
-	RING0_PTREGS_FRAME		# can't unwind into user space anyway
-work_pending:
-	testb $_TIF_NEED_RESCHED, %cl
-	jz work_notifysig
-work_resched:
-	call schedule
-	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
-					# setting need_resched or sigpending
-					# between sampling and the iret
-	TRACE_IRQS_OFF
-	movl TI_flags(%ebp), %ecx
-	andl $_TIF_WORK_MASK, %ecx	# is there any work to be done other
-					# than syscall tracing?
-	jz restore_all
-	testb $_TIF_NEED_RESCHED, %cl
-	jnz work_resched
-
-work_notifysig:				# deal with pending signals and
-					# notify-resume requests
-#ifdef CONFIG_VM86
-	testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
-	movl %esp, %eax
-	jnz work_notifysig_v86		# returning to kernel-space or
-					# vm86-space
-1:
-#else
-	movl %esp, %eax
-#endif
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	movb PT_CS(%esp), %bl
-	andb $SEGMENT_RPL_MASK, %bl
-	cmpb $USER_RPL, %bl
-	jb resume_kernel
-	xorl %edx, %edx
-	call do_notify_resume
-	jmp resume_userspace
-
-#ifdef CONFIG_VM86
-	ALIGN
-work_notifysig_v86:
-	pushl_cfi %ecx			# save ti_flags for do_notify_resume
-	call save_v86_state		# %eax contains pt_regs pointer
-	popl_cfi %ecx
-	movl %eax, %esp
-	jmp 1b
-#endif
-END(work_pending)
-
-	# perform syscall exit tracing
-	ALIGN
-syscall_trace_entry:
-	movl $-ENOSYS,PT_EAX(%esp)
-	movl %esp, %eax
-	call syscall_trace_enter
-	/* What it returned is what we'll actually use.  */
-	cmpl $(NR_syscalls), %eax
-	jnae syscall_call
-	jmp syscall_exit
-END(syscall_trace_entry)
-
-	# perform syscall exit tracing
-	ALIGN
-syscall_exit_work:
-	testl $_TIF_WORK_SYSCALL_EXIT, %ecx
-	jz work_pending
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)	# could let syscall_trace_leave() call
-					# schedule() instead
-	movl %esp, %eax
-	call syscall_trace_leave
-	jmp resume_userspace
-END(syscall_exit_work)
-	CFI_ENDPROC
-
-	RING0_INT_FRAME			# can't unwind into user space anyway
-syscall_fault:
-	ASM_CLAC
-	GET_THREAD_INFO(%ebp)
-	movl $-EFAULT,PT_EAX(%esp)
-	jmp resume_userspace
-END(syscall_fault)
-
-syscall_badsys:
-	movl $-ENOSYS,%eax
-	jmp syscall_after_call
-END(syscall_badsys)
-
-sysenter_badsys:
-	movl $-ENOSYS,%eax
-	jmp sysenter_after_call
-END(sysenter_badsys)
-	CFI_ENDPROC
-
-.macro FIXUP_ESPFIX_STACK
-/*
- * Switch back for ESPFIX stack to the normal zerobased stack
- *
- * We can't call C functions using the ESPFIX stack. This code reads
- * the high word of the segment base from the GDT and swiches to the
- * normal stack and adjusts ESP with the matching offset.
- */
-#ifdef CONFIG_X86_ESPFIX32
-	/* fixup the stack */
-	mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
-	mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
-	shl $16, %eax
-	addl %esp, %eax			/* the adjusted stack pointer */
-	pushl_cfi $__KERNEL_DS
-	pushl_cfi %eax
-	lss (%esp), %esp		/* switch to the normal stack segment */
-	CFI_ADJUST_CFA_OFFSET -8
-#endif
-.endm
-.macro UNWIND_ESPFIX_STACK
-#ifdef CONFIG_X86_ESPFIX32
-	movl %ss, %eax
-	/* see if on espfix stack */
-	cmpw $__ESPFIX_SS, %ax
-	jne 27f
-	movl $__KERNEL_DS, %eax
-	movl %eax, %ds
-	movl %eax, %es
-	/* switch to normal stack */
-	FIXUP_ESPFIX_STACK
-27:
-#endif
-.endm
-
-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
-	.align 8
-ENTRY(irq_entries_start)
-	RING0_INT_FRAME
-    vector=FIRST_EXTERNAL_VECTOR
-    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
-	pushl_cfi $(~vector+0x80)	/* Note: always in signed byte range */
-    vector=vector+1
-	jmp	common_interrupt
-	CFI_ADJUST_CFA_OFFSET -4
-	.align	8
-    .endr
-END(irq_entries_start)
-
-/*
- * the CPU automatically disables interrupts when executing an IRQ vector,
- * so IRQ-flags tracing has to follow that:
- */
-	.p2align CONFIG_X86_L1_CACHE_SHIFT
-common_interrupt:
-	ASM_CLAC
-	addl $-0x80,(%esp)	/* Adjust vector into the [-256,-1] range */
-	SAVE_ALL
-	TRACE_IRQS_OFF
-	movl %esp,%eax
-	call do_IRQ
-	jmp ret_from_intr
-ENDPROC(common_interrupt)
-	CFI_ENDPROC
-
-#define BUILD_INTERRUPT3(name, nr, fn)	\
-ENTRY(name)				\
-	RING0_INT_FRAME;		\
-	ASM_CLAC;			\
-	pushl_cfi $~(nr);		\
-	SAVE_ALL;			\
-	TRACE_IRQS_OFF			\
-	movl %esp,%eax;			\
-	call fn;			\
-	jmp ret_from_intr;		\
-	CFI_ENDPROC;			\
-ENDPROC(name)
-
-
-#ifdef CONFIG_TRACING
-#define TRACE_BUILD_INTERRUPT(name, nr)		\
-	BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
-#else
-#define TRACE_BUILD_INTERRUPT(name, nr)
-#endif
-
-#define BUILD_INTERRUPT(name, nr) \
-	BUILD_INTERRUPT3(name, nr, smp_##name); \
-	TRACE_BUILD_INTERRUPT(name, nr)
-
-/* The include is where all of the SMP etc. interrupts come from */
-#include <asm/entry_arch.h>
-
-ENTRY(coprocessor_error)
-	RING0_INT_FRAME
-	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_coprocessor_error
-	jmp error_code
-	CFI_ENDPROC
-END(coprocessor_error)
-
-ENTRY(simd_coprocessor_error)
-	RING0_INT_FRAME
-	ASM_CLAC
-	pushl_cfi $0
-#ifdef CONFIG_X86_INVD_BUG
-	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-	ALTERNATIVE "pushl_cfi $do_general_protection",	\
-		    "pushl $do_simd_coprocessor_error", \
-		    X86_FEATURE_XMM
-#else
-	pushl_cfi $do_simd_coprocessor_error
-#endif
-	jmp error_code
-	CFI_ENDPROC
-END(simd_coprocessor_error)
-
-ENTRY(device_not_available)
-	RING0_INT_FRAME
-	ASM_CLAC
-	pushl_cfi $-1			# mark this as an int
-	pushl_cfi $do_device_not_available
-	jmp error_code
-	CFI_ENDPROC
-END(device_not_available)
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_iret)
-	iret
-	_ASM_EXTABLE(native_iret, iret_exc)
-END(native_iret)
-
-ENTRY(native_irq_enable_sysexit)
-	sti
-	sysexit
-END(native_irq_enable_sysexit)
-#endif
-
-ENTRY(overflow)
-	RING0_INT_FRAME
-	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_overflow
-	jmp error_code
-	CFI_ENDPROC
-END(overflow)
-
-ENTRY(bounds)
-	RING0_INT_FRAME
-	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_bounds
-	jmp error_code
-	CFI_ENDPROC
-END(bounds)
-
-ENTRY(invalid_op)
-	RING0_INT_FRAME
-	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_invalid_op
-	jmp error_code
-	CFI_ENDPROC
-END(invalid_op)
-
-ENTRY(coprocessor_segment_overrun)
-	RING0_INT_FRAME
-	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_coprocessor_segment_overrun
-	jmp error_code
-	CFI_ENDPROC
-END(coprocessor_segment_overrun)
-
-ENTRY(invalid_TSS)
-	RING0_EC_FRAME
-	ASM_CLAC
-	pushl_cfi $do_invalid_TSS
-	jmp error_code
-	CFI_ENDPROC
-END(invalid_TSS)
-
-ENTRY(segment_not_present)
-	RING0_EC_FRAME
-	ASM_CLAC
-	pushl_cfi $do_segment_not_present
-	jmp error_code
-	CFI_ENDPROC
-END(segment_not_present)
-
-ENTRY(stack_segment)
-	RING0_EC_FRAME
-	ASM_CLAC
-	pushl_cfi $do_stack_segment
-	jmp error_code
-	CFI_ENDPROC
-END(stack_segment)
-
-ENTRY(alignment_check)
-	RING0_EC_FRAME
-	ASM_CLAC
-	pushl_cfi $do_alignment_check
-	jmp error_code
-	CFI_ENDPROC
-END(alignment_check)
-
-ENTRY(divide_error)
-	RING0_INT_FRAME
-	ASM_CLAC
-	pushl_cfi $0			# no error code
-	pushl_cfi $do_divide_error
-	jmp error_code
-	CFI_ENDPROC
-END(divide_error)
-
-#ifdef CONFIG_X86_MCE
-ENTRY(machine_check)
-	RING0_INT_FRAME
-	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi machine_check_vector
-	jmp error_code
-	CFI_ENDPROC
-END(machine_check)
-#endif
-
-ENTRY(spurious_interrupt_bug)
-	RING0_INT_FRAME
-	ASM_CLAC
-	pushl_cfi $0
-	pushl_cfi $do_spurious_interrupt_bug
-	jmp error_code
-	CFI_ENDPROC
-END(spurious_interrupt_bug)
-
-#ifdef CONFIG_XEN
-/* Xen doesn't set %esp to be precisely what the normal sysenter
-   entrypoint expects, so fix it up before using the normal path. */
-ENTRY(xen_sysenter_target)
-	RING0_INT_FRAME
-	addl $5*4, %esp		/* remove xen-provided frame */
-	CFI_ADJUST_CFA_OFFSET -5*4
-	jmp sysenter_past_esp
-	CFI_ENDPROC
-
-ENTRY(xen_hypervisor_callback)
-	CFI_STARTPROC
-	pushl_cfi $-1 /* orig_ax = -1 => not a system call */
-	SAVE_ALL
-	TRACE_IRQS_OFF
-
-	/* Check to see if we got the event in the critical
-	   region in xen_iret_direct, after we've reenabled
-	   events and checked for pending events.  This simulates
-	   iret instruction's behaviour where it delivers a
-	   pending interrupt when enabling interrupts. */
-	movl PT_EIP(%esp),%eax
-	cmpl $xen_iret_start_crit,%eax
-	jb   1f
-	cmpl $xen_iret_end_crit,%eax
-	jae  1f
-
-	jmp  xen_iret_crit_fixup
-
-ENTRY(xen_do_upcall)
-1:	mov %esp, %eax
-	call xen_evtchn_do_upcall
-#ifndef CONFIG_PREEMPT
-	call xen_maybe_preempt_hcall
-#endif
-	jmp  ret_from_intr
-	CFI_ENDPROC
-ENDPROC(xen_hypervisor_callback)
-
-# Hypervisor uses this for application faults while it executes.
-# We get here for two reasons:
-#  1. Fault while reloading DS, ES, FS or GS
-#  2. Fault while executing IRET
-# Category 1 we fix up by reattempting the load, and zeroing the segment
-# register if the load fails.
-# Category 2 we fix up by jumping to do_iret_error. We cannot use the
-# normal Linux return path in this case because if we use the IRET hypercall
-# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
-# We distinguish between categories by maintaining a status value in EAX.
-ENTRY(xen_failsafe_callback)
-	CFI_STARTPROC
-	pushl_cfi %eax
-	movl $1,%eax
-1:	mov 4(%esp),%ds
-2:	mov 8(%esp),%es
-3:	mov 12(%esp),%fs
-4:	mov 16(%esp),%gs
-	/* EAX == 0 => Category 1 (Bad segment)
-	   EAX != 0 => Category 2 (Bad IRET) */
-	testl %eax,%eax
-	popl_cfi %eax
-	lea 16(%esp),%esp
-	CFI_ADJUST_CFA_OFFSET -16
-	jz 5f
-	jmp iret_exc
-5:	pushl_cfi $-1 /* orig_ax = -1 => not a system call */
-	SAVE_ALL
-	jmp ret_from_exception
-	CFI_ENDPROC
-
-.section .fixup,"ax"
-6:	xorl %eax,%eax
-	movl %eax,4(%esp)
-	jmp 1b
-7:	xorl %eax,%eax
-	movl %eax,8(%esp)
-	jmp 2b
-8:	xorl %eax,%eax
-	movl %eax,12(%esp)
-	jmp 3b
-9:	xorl %eax,%eax
-	movl %eax,16(%esp)
-	jmp 4b
-.previous
-	_ASM_EXTABLE(1b,6b)
-	_ASM_EXTABLE(2b,7b)
-	_ASM_EXTABLE(3b,8b)
-	_ASM_EXTABLE(4b,9b)
-ENDPROC(xen_failsafe_callback)
-
-BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
-		xen_evtchn_do_upcall)
-
-#endif	/* CONFIG_XEN */
-
-#if IS_ENABLED(CONFIG_HYPERV)
-
-BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
-	hyperv_vector_handler)
-
-#endif /* CONFIG_HYPERV */
-
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-
-ENTRY(mcount)
-	ret
-END(mcount)
-
-ENTRY(ftrace_caller)
-	pushl %eax
-	pushl %ecx
-	pushl %edx
-	pushl $0	/* Pass NULL as regs pointer */
-	movl 4*4(%esp), %eax
-	movl 0x4(%ebp), %edx
-	movl function_trace_op, %ecx
-	subl $MCOUNT_INSN_SIZE, %eax
-
-.globl ftrace_call
-ftrace_call:
-	call ftrace_stub
-
-	addl $4,%esp	/* skip NULL pointer */
-	popl %edx
-	popl %ecx
-	popl %eax
-ftrace_ret:
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
-	jmp ftrace_stub
-#endif
-
-.globl ftrace_stub
-ftrace_stub:
-	ret
-END(ftrace_caller)
-
-ENTRY(ftrace_regs_caller)
-	pushf	/* push flags before compare (in cs location) */
-
-	/*
-	 * i386 does not save SS and ESP when coming from kernel.
-	 * Instead, to get sp, &regs->sp is used (see ptrace.h).
-	 * Unfortunately, that means eflags must be at the same location
-	 * as the current return ip is. We move the return ip into the
-	 * ip location, and move flags into the return ip location.
-	 */
-	pushl 4(%esp)	/* save return ip into ip slot */
-
-	pushl $0	/* Load 0 into orig_ax */
-	pushl %gs
-	pushl %fs
-	pushl %es
-	pushl %ds
-	pushl %eax
-	pushl %ebp
-	pushl %edi
-	pushl %esi
-	pushl %edx
-	pushl %ecx
-	pushl %ebx
-
-	movl 13*4(%esp), %eax	/* Get the saved flags */
-	movl %eax, 14*4(%esp)	/* Move saved flags into regs->flags location */
-				/* clobbering return ip */
-	movl $__KERNEL_CS,13*4(%esp)
-
-	movl 12*4(%esp), %eax	/* Load ip (1st parameter) */
-	subl $MCOUNT_INSN_SIZE, %eax	/* Adjust ip */
-	movl 0x4(%ebp), %edx	/* Load parent ip (2nd parameter) */
-	movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
-	pushl %esp		/* Save pt_regs as 4th parameter */
-
-GLOBAL(ftrace_regs_call)
-	call ftrace_stub
-
-	addl $4, %esp		/* Skip pt_regs */
-	movl 14*4(%esp), %eax	/* Move flags back into cs */
-	movl %eax, 13*4(%esp)	/* Needed to keep addl from modifying flags */
-	movl 12*4(%esp), %eax	/* Get return ip from regs->ip */
-	movl %eax, 14*4(%esp)	/* Put return ip back for ret */
-
-	popl %ebx
-	popl %ecx
-	popl %edx
-	popl %esi
-	popl %edi
-	popl %ebp
-	popl %eax
-	popl %ds
-	popl %es
-	popl %fs
-	popl %gs
-	addl $8, %esp		/* Skip orig_ax and ip */
-	popf			/* Pop flags at end (no addl to corrupt flags) */
-	jmp ftrace_ret
-
-	popf
-	jmp  ftrace_stub
-#else /* ! CONFIG_DYNAMIC_FTRACE */
-
-ENTRY(mcount)
-	cmpl $__PAGE_OFFSET, %esp
-	jb ftrace_stub		/* Paging not enabled yet? */
-
-	cmpl $ftrace_stub, ftrace_trace_function
-	jnz trace
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	cmpl $ftrace_stub, ftrace_graph_return
-	jnz ftrace_graph_caller
-
-	cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
-	jnz ftrace_graph_caller
-#endif
-.globl ftrace_stub
-ftrace_stub:
-	ret
-
-	/* taken from glibc */
-trace:
-	pushl %eax
-	pushl %ecx
-	pushl %edx
-	movl 0xc(%esp), %eax
-	movl 0x4(%ebp), %edx
-	subl $MCOUNT_INSN_SIZE, %eax
-
-	call *ftrace_trace_function
-
-	popl %edx
-	popl %ecx
-	popl %eax
-	jmp ftrace_stub
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
-	pushl %eax
-	pushl %ecx
-	pushl %edx
-	movl 0xc(%esp), %eax
-	lea 0x4(%ebp), %edx
-	movl (%ebp), %ecx
-	subl $MCOUNT_INSN_SIZE, %eax
-	call prepare_ftrace_return
-	popl %edx
-	popl %ecx
-	popl %eax
-	ret
-END(ftrace_graph_caller)
-
-.globl return_to_handler
-return_to_handler:
-	pushl %eax
-	pushl %edx
-	movl %ebp, %eax
-	call ftrace_return_to_handler
-	movl %eax, %ecx
-	popl %edx
-	popl %eax
-	jmp *%ecx
-#endif
-
-#ifdef CONFIG_TRACING
-ENTRY(trace_page_fault)
-	RING0_EC_FRAME
-	ASM_CLAC
-	pushl_cfi $trace_do_page_fault
-	jmp error_code
-	CFI_ENDPROC
-END(trace_page_fault)
-#endif
-
-ENTRY(page_fault)
-	RING0_EC_FRAME
-	ASM_CLAC
-	pushl_cfi $do_page_fault
-	ALIGN
-error_code:
-	/* the function address is in %gs's slot on the stack */
-	pushl_cfi %fs
-	/*CFI_REL_OFFSET fs, 0*/
-	pushl_cfi %es
-	/*CFI_REL_OFFSET es, 0*/
-	pushl_cfi %ds
-	/*CFI_REL_OFFSET ds, 0*/
-	pushl_cfi_reg eax
-	pushl_cfi_reg ebp
-	pushl_cfi_reg edi
-	pushl_cfi_reg esi
-	pushl_cfi_reg edx
-	pushl_cfi_reg ecx
-	pushl_cfi_reg ebx
-	cld
-	movl $(__KERNEL_PERCPU), %ecx
-	movl %ecx, %fs
-	UNWIND_ESPFIX_STACK
-	GS_TO_REG %ecx
-	movl PT_GS(%esp), %edi		# get the function address
-	movl PT_ORIG_EAX(%esp), %edx	# get the error code
-	movl $-1, PT_ORIG_EAX(%esp)	# no syscall to restart
-	REG_TO_PTGS %ecx
-	SET_KERNEL_GS %ecx
-	movl $(__USER_DS), %ecx
-	movl %ecx, %ds
-	movl %ecx, %es
-	TRACE_IRQS_OFF
-	movl %esp,%eax			# pt_regs pointer
-	call *%edi
-	jmp ret_from_exception
-	CFI_ENDPROC
-END(page_fault)
-
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-.macro FIX_STACK offset ok label
-	cmpw $__KERNEL_CS, 4(%esp)
-	jne \ok
-\label:
-	movl TSS_sysenter_sp0 + \offset(%esp), %esp
-	CFI_DEF_CFA esp, 0
-	CFI_UNDEFINED eip
-	pushfl_cfi
-	pushl_cfi $__KERNEL_CS
-	pushl_cfi $sysenter_past_esp
-	CFI_REL_OFFSET eip, 0
-.endm
-
-ENTRY(debug)
-	RING0_INT_FRAME
-	ASM_CLAC
-	cmpl $ia32_sysenter_target,(%esp)
-	jne debug_stack_correct
-	FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
-debug_stack_correct:
-	pushl_cfi $-1			# mark this as an int
-	SAVE_ALL
-	TRACE_IRQS_OFF
-	xorl %edx,%edx			# error code 0
-	movl %esp,%eax			# pt_regs pointer
-	call do_debug
-	jmp ret_from_exception
-	CFI_ENDPROC
-END(debug)
-
-/*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got  an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter path.
- */
-ENTRY(nmi)
-	RING0_INT_FRAME
-	ASM_CLAC
-#ifdef CONFIG_X86_ESPFIX32
-	pushl_cfi %eax
-	movl %ss, %eax
-	cmpw $__ESPFIX_SS, %ax
-	popl_cfi %eax
-	je nmi_espfix_stack
-#endif
-	cmpl $ia32_sysenter_target,(%esp)
-	je nmi_stack_fixup
-	pushl_cfi %eax
-	movl %esp,%eax
-	/* Do not access memory above the end of our stack page,
-	 * it might not exist.
-	 */
-	andl $(THREAD_SIZE-1),%eax
-	cmpl $(THREAD_SIZE-20),%eax
-	popl_cfi %eax
-	jae nmi_stack_correct
-	cmpl $ia32_sysenter_target,12(%esp)
-	je nmi_debug_stack_check
-nmi_stack_correct:
-	/* We have a RING0_INT_FRAME here */
-	pushl_cfi %eax
-	SAVE_ALL
-	xorl %edx,%edx		# zero error code
-	movl %esp,%eax		# pt_regs pointer
-	call do_nmi
-	jmp restore_all_notrace
-	CFI_ENDPROC
-
-nmi_stack_fixup:
-	RING0_INT_FRAME
-	FIX_STACK 12, nmi_stack_correct, 1
-	jmp nmi_stack_correct
-
-nmi_debug_stack_check:
-	/* We have a RING0_INT_FRAME here */
-	cmpw $__KERNEL_CS,16(%esp)
-	jne nmi_stack_correct
-	cmpl $debug,(%esp)
-	jb nmi_stack_correct
-	cmpl $debug_esp_fix_insn,(%esp)
-	ja nmi_stack_correct
-	FIX_STACK 24, nmi_stack_correct, 1
-	jmp nmi_stack_correct
-
-#ifdef CONFIG_X86_ESPFIX32
-nmi_espfix_stack:
-	/* We have a RING0_INT_FRAME here.
-	 *
-	 * create the pointer to lss back
-	 */
-	pushl_cfi %ss
-	pushl_cfi %esp
-	addl $4, (%esp)
-	/* copy the iret frame of 12 bytes */
-	.rept 3
-	pushl_cfi 16(%esp)
-	.endr
-	pushl_cfi %eax
-	SAVE_ALL
-	FIXUP_ESPFIX_STACK		# %eax == %esp
-	xorl %edx,%edx			# zero error code
-	call do_nmi
-	RESTORE_REGS
-	lss 12+4(%esp), %esp		# back to espfix stack
-	CFI_ADJUST_CFA_OFFSET -24
-	jmp irq_return
-#endif
-	CFI_ENDPROC
-END(nmi)
-
-ENTRY(int3)
-	RING0_INT_FRAME
-	ASM_CLAC
-	pushl_cfi $-1			# mark this as an int
-	SAVE_ALL
-	TRACE_IRQS_OFF
-	xorl %edx,%edx		# zero error code
-	movl %esp,%eax		# pt_regs pointer
-	call do_int3
-	jmp ret_from_exception
-	CFI_ENDPROC
-END(int3)
-
-ENTRY(general_protection)
-	RING0_EC_FRAME
-	pushl_cfi $do_general_protection
-	jmp error_code
-	CFI_ENDPROC
-END(general_protection)
-
-#ifdef CONFIG_KVM_GUEST
-ENTRY(async_page_fault)
-	RING0_EC_FRAME
-	ASM_CLAC
-	pushl_cfi $do_async_page_fault
-	jmp error_code
-	CFI_ENDPROC
-END(async_page_fault)
-#endif
-
diff --git a/arch/x86/kernel/fpu/Makefile b/arch/x86/kernel/fpu/Makefile
new file mode 100644
index 000000000000..68279efb811a
--- /dev/null
+++ b/arch/x86/kernel/fpu/Makefile
@@ -0,0 +1,5 @@
+#
+# Build rules for the FPU support code:
+#
+
+obj-y				+= init.o bugs.o core.o regset.o signal.o xstate.o
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
new file mode 100644
index 000000000000..dd9ca9b60ff3
--- /dev/null
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -0,0 +1,71 @@
+/*
+ * x86 FPU bug checks:
+ */
+#include <asm/fpu/internal.h>
+
+/*
+ * Boot time CPU/FPU FDIV bug detection code:
+ */
+
+static double __initdata x = 4195835.0;
+static double __initdata y = 3145727.0;
+
+/*
+ * This used to check for exceptions..
+ * However, it turns out that to support that,
+ * the XMM trap handlers basically had to
+ * be buggy. So let's have a correct XMM trap
+ * handler, and forget about printing out
+ * some status at boot.
+ *
+ * We should really only care about bugs here
+ * anyway. Not features.
+ */
+static void __init check_fpu(void)
+{
+	u32 cr0_saved;
+	s32 fdiv_bug;
+
+	/* We might have CR0::TS set already, clear it: */
+	cr0_saved = read_cr0();
+	write_cr0(cr0_saved & ~X86_CR0_TS);
+
+	kernel_fpu_begin();
+
+	/*
+	 * trap_init() enabled FXSR and company _before_ testing for FP
+	 * problems here.
+	 *
+	 * Test for the divl bug: http://en.wikipedia.org/wiki/Fdiv_bug
+	 */
+	__asm__("fninit\n\t"
+		"fldl %1\n\t"
+		"fdivl %2\n\t"
+		"fmull %2\n\t"
+		"fldl %1\n\t"
+		"fsubp %%st,%%st(1)\n\t"
+		"fistpl %0\n\t"
+		"fwait\n\t"
+		"fninit"
+		: "=m" (*&fdiv_bug)
+		: "m" (*&x), "m" (*&y));
+
+	kernel_fpu_end();
+
+	write_cr0(cr0_saved);
+
+	if (fdiv_bug) {
+		set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
+		pr_warn("Hmm, FPU with FDIV bug\n");
+	}
+}
+
+void __init fpu__init_check_bugs(void)
+{
+	/*
+	 * kernel_fpu_begin/end() in check_fpu() relies on the patched
+	 * alternative instructions.
+	 */
+	if (cpu_has_fpu)
+		check_fpu();
+}
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
new file mode 100644
index 000000000000..79de954626fd
--- /dev/null
+++ b/arch/x86/kernel/fpu/core.c
@@ -0,0 +1,523 @@
+/*
+ *  Copyright (C) 1994 Linus Torvalds
+ *
+ *  Pentium III FXSR, SSE support
+ *  General FPU state handling cleanups
+ *	Gareth Hughes <gareth@valinux.com>, May 2000
+ */
+#include <asm/fpu/internal.h>
+#include <asm/fpu/regset.h>
+#include <asm/fpu/signal.h>
+#include <asm/traps.h>
+
+#include <linux/hardirq.h>
+
+/*
+ * Represents the initial FPU state. It's mostly (but not completely) zeroes,
+ * depending on the FPU hardware format:
+ */
+union fpregs_state init_fpstate __read_mostly;
+
+/*
+ * Track whether the kernel is using the FPU state
+ * currently.
+ *
+ * This flag is used:
+ *
+ *   - by IRQ context code to potentially use the FPU
+ *     if it's unused.
+ *
+ *   - to debug kernel_fpu_begin()/end() correctness
+ */
+static DEFINE_PER_CPU(bool, in_kernel_fpu);
+
+/*
+ * Track which context is using the FPU on the CPU:
+ */
+DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
+
+static void kernel_fpu_disable(void)
+{
+	WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
+	this_cpu_write(in_kernel_fpu, true);
+}
+
+static void kernel_fpu_enable(void)
+{
+	WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
+	this_cpu_write(in_kernel_fpu, false);
+}
+
+static bool kernel_fpu_disabled(void)
+{
+	return this_cpu_read(in_kernel_fpu);
+}
+
+/*
+ * Were we in an interrupt that interrupted kernel mode?
+ *
+ * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that
+ * pair does nothing at all: the thread must not have fpu (so
+ * that we don't try to save the FPU state), and TS must
+ * be set (so that the clts/stts pair does nothing that is
+ * visible in the interrupted kernel thread).
+ *
+ * Except for the eagerfpu case when we return true; in the likely case
+ * the thread has FPU but we are not going to set/clear TS.
+ */
+static bool interrupted_kernel_fpu_idle(void)
+{
+	if (kernel_fpu_disabled())
+		return false;
+
+	if (use_eager_fpu())
+		return true;
+
+	return !current->thread.fpu.fpregs_active && (read_cr0() & X86_CR0_TS);
+}
+
+/*
+ * Were we in user mode (or vm86 mode) when we were
+ * interrupted?
+ *
+ * Doing kernel_fpu_begin/end() is ok if we are running
+ * in an interrupt context from user mode - we'll just
+ * save the FPU state as required.
+ */
+static bool interrupted_user_mode(void)
+{
+	struct pt_regs *regs = get_irq_regs();
+	return regs && user_mode(regs);
+}
+
+/*
+ * Can we use the FPU in kernel mode with the
+ * whole "kernel_fpu_begin/end()" sequence?
+ *
+ * It's always ok in process context (ie "not interrupt")
+ * but it is sometimes ok even from an irq.
+ */
+bool irq_fpu_usable(void)
+{
+	return !in_interrupt() ||
+		interrupted_user_mode() ||
+		interrupted_kernel_fpu_idle();
+}
+EXPORT_SYMBOL(irq_fpu_usable);
+
+void __kernel_fpu_begin(void)
+{
+	struct fpu *fpu = &current->thread.fpu;
+
+	WARN_ON_FPU(!irq_fpu_usable());
+
+	kernel_fpu_disable();
+
+	if (fpu->fpregs_active) {
+		copy_fpregs_to_fpstate(fpu);
+	} else {
+		this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+		__fpregs_activate_hw();
+	}
+}
+EXPORT_SYMBOL(__kernel_fpu_begin);
+
+void __kernel_fpu_end(void)
+{
+	struct fpu *fpu = &current->thread.fpu;
+
+	if (fpu->fpregs_active)
+		copy_kernel_to_fpregs(&fpu->state);
+	else
+		__fpregs_deactivate_hw();
+
+	kernel_fpu_enable();
+}
+EXPORT_SYMBOL(__kernel_fpu_end);
+
+void kernel_fpu_begin(void)
+{
+	preempt_disable();
+	__kernel_fpu_begin();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+
+void kernel_fpu_end(void)
+{
+	__kernel_fpu_end();
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_end);
+
+/*
+ * CR0::TS save/restore functions:
+ */
+int irq_ts_save(void)
+{
+	/*
+	 * If in process context and not atomic, we can take a spurious DNA fault.
+	 * Otherwise, doing clts() in process context requires disabling preemption
+	 * or some heavy lifting like kernel_fpu_begin()
+	 */
+	if (!in_atomic())
+		return 0;
+
+	if (read_cr0() & X86_CR0_TS) {
+		clts();
+		return 1;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(irq_ts_save);
+
+void irq_ts_restore(int TS_state)
+{
+	if (TS_state)
+		stts();
+}
+EXPORT_SYMBOL_GPL(irq_ts_restore);
+
+/*
+ * Save the FPU state (mark it for reload if necessary):
+ *
+ * This only ever gets called for the current task.
+ */
+void fpu__save(struct fpu *fpu)
+{
+	WARN_ON_FPU(fpu != &current->thread.fpu);
+
+	preempt_disable();
+	if (fpu->fpregs_active) {
+		if (!copy_fpregs_to_fpstate(fpu))
+			fpregs_deactivate(fpu);
+	}
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(fpu__save);
+
+/*
+ * Legacy x87 fpstate state init:
+ */
+static inline void fpstate_init_fstate(struct fregs_state *fp)
+{
+	fp->cwd = 0xffff037fu;
+	fp->swd = 0xffff0000u;
+	fp->twd = 0xffffffffu;
+	fp->fos = 0xffff0000u;
+}
+
+void fpstate_init(union fpregs_state *state)
+{
+	if (!cpu_has_fpu) {
+		fpstate_init_soft(&state->soft);
+		return;
+	}
+
+	memset(state, 0, xstate_size);
+
+	if (cpu_has_fxsr)
+		fpstate_init_fxstate(&state->fxsave);
+	else
+		fpstate_init_fstate(&state->fsave);
+}
+EXPORT_SYMBOL_GPL(fpstate_init);
+
+/*
+ * Copy the current task's FPU state to a new task's FPU context.
+ *
+ * In both the 'eager' and the 'lazy' case we save hardware registers
+ * directly to the destination buffer.
+ */
+static void fpu_copy(struct fpu *dst_fpu, struct fpu *src_fpu)
+{
+	WARN_ON_FPU(src_fpu != &current->thread.fpu);
+
+	/*
+	 * Don't let 'init optimized' areas of the XSAVE area
+	 * leak into the child task:
+	 */
+	if (use_eager_fpu())
+		memset(&dst_fpu->state.xsave, 0, xstate_size);
+
+	/*
+	 * Save current FPU registers directly into the child
+	 * FPU context, without any memory-to-memory copying.
+	 *
+	 * If the FPU context got destroyed in the process (FNSAVE
+	 * done on old CPUs) then copy it back into the source
+	 * context and mark the current task for lazy restore.
+	 *
+	 * We have to do all this with preemption disabled,
+	 * mostly because of the FNSAVE case, because in that
+	 * case we must not allow preemption in the window
+	 * between the FNSAVE and us marking the context lazy.
+	 *
+	 * It shouldn't be an issue as even FNSAVE is plenty
+	 * fast in terms of critical section length.
+	 */
+	preempt_disable();
+	if (!copy_fpregs_to_fpstate(dst_fpu)) {
+		memcpy(&src_fpu->state, &dst_fpu->state, xstate_size);
+		fpregs_deactivate(src_fpu);
+	}
+	preempt_enable();
+}
+
+int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
+{
+	dst_fpu->counter = 0;
+	dst_fpu->fpregs_active = 0;
+	dst_fpu->last_cpu = -1;
+
+	if (src_fpu->fpstate_active)
+		fpu_copy(dst_fpu, src_fpu);
+
+	return 0;
+}
+
+/*
+ * Activate the current task's in-memory FPU context,
+ * if it has not been used before:
+ */
+void fpu__activate_curr(struct fpu *fpu)
+{
+	WARN_ON_FPU(fpu != &current->thread.fpu);
+
+	if (!fpu->fpstate_active) {
+		fpstate_init(&fpu->state);
+
+		/* Safe to do for the current task: */
+		fpu->fpstate_active = 1;
+	}
+}
+EXPORT_SYMBOL_GPL(fpu__activate_curr);
+
+/*
+ * This function must be called before we read a task's fpstate.
+ *
+ * If the task has not used the FPU before then initialize its
+ * fpstate.
+ *
+ * If the task has used the FPU before then save it.
+ */
+void fpu__activate_fpstate_read(struct fpu *fpu)
+{
+	/*
+	 * If fpregs are active (in the current CPU), then
+	 * copy them to the fpstate:
+	 */
+	if (fpu->fpregs_active) {
+		fpu__save(fpu);
+	} else {
+		if (!fpu->fpstate_active) {
+			fpstate_init(&fpu->state);
+
+			/* Safe to do for current and for stopped child tasks: */
+			fpu->fpstate_active = 1;
+		}
+	}
+}
+
+/*
+ * This function must be called before we write a task's fpstate.
+ *
+ * If the task has used the FPU before then unlazy it.
+ * If the task has not used the FPU before then initialize its fpstate.
+ *
+ * After this function call, after registers in the fpstate are
+ * modified and the child task has woken up, the child task will
+ * restore the modified FPU state from the modified context. If we
+ * didn't clear its lazy status here then the lazy in-registers
+ * state pending on its former CPU could be restored, corrupting
+ * the modifications.
+ */
+void fpu__activate_fpstate_write(struct fpu *fpu)
+{
+	/*
+	 * Only stopped child tasks can be used to modify the FPU
+	 * state in the fpstate buffer:
+	 */
+	WARN_ON_FPU(fpu == &current->thread.fpu);
+
+	if (fpu->fpstate_active) {
+		/* Invalidate any lazy state: */
+		fpu->last_cpu = -1;
+	} else {
+		fpstate_init(&fpu->state);
+
+		/* Safe to do for stopped child tasks: */
+		fpu->fpstate_active = 1;
+	}
+}
+
+/*
+ * 'fpu__restore()' is called to copy FPU registers from
+ * the FPU fpstate to the live hw registers and to activate
+ * access to the hardware registers, so that FPU instructions
+ * can be used afterwards.
+ *
+ * Must be called with kernel preemption disabled (for example
+ * with local interrupts disabled, as it is in the case of
+ * do_device_not_available()).
+ */
+void fpu__restore(struct fpu *fpu)
+{
+	fpu__activate_curr(fpu);
+
+	/* Avoid __kernel_fpu_begin() right after fpregs_activate() */
+	kernel_fpu_disable();
+	fpregs_activate(fpu);
+	copy_kernel_to_fpregs(&fpu->state);
+	fpu->counter++;
+	kernel_fpu_enable();
+}
+EXPORT_SYMBOL_GPL(fpu__restore);
+
+/*
+ * Drops current FPU state: deactivates the fpregs and
+ * the fpstate. NOTE: it still leaves previous contents
+ * in the fpregs in the eager-FPU case.
+ *
+ * This function can be used in cases where we know that
+ * a state-restore is coming: either an explicit one,
+ * or a reschedule.
+ */
+void fpu__drop(struct fpu *fpu)
+{
+	preempt_disable();
+	fpu->counter = 0;
+
+	if (fpu->fpregs_active) {
+		/* Ignore delayed exceptions from user space */
+		asm volatile("1: fwait\n"
+			     "2:\n"
+			     _ASM_EXTABLE(1b, 2b));
+		fpregs_deactivate(fpu);
+	}
+
+	fpu->fpstate_active = 0;
+
+	preempt_enable();
+}
+
+/*
+ * Clear FPU registers by setting them up from
+ * the init fpstate:
+ */
+static inline void copy_init_fpstate_to_fpregs(void)
+{
+	if (use_xsave())
+		copy_kernel_to_xregs(&init_fpstate.xsave, -1);
+	else
+		copy_kernel_to_fxregs(&init_fpstate.fxsave);
+}
+
+/*
+ * Clear the FPU state back to init state.
+ *
+ * Called by sys_execve(), by the signal handler code and by various
+ * error paths.
+ */
+void fpu__clear(struct fpu *fpu)
+{
+	WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */
+
+	if (!use_eager_fpu()) {
+		/* FPU state will be reallocated lazily at the first use. */
+		fpu__drop(fpu);
+	} else {
+		if (!fpu->fpstate_active) {
+			fpu__activate_curr(fpu);
+			user_fpu_begin();
+		}
+		copy_init_fpstate_to_fpregs();
+	}
+}
+
+/*
+ * x87 math exception handling:
+ */
+
+static inline unsigned short get_fpu_cwd(struct fpu *fpu)
+{
+	if (cpu_has_fxsr) {
+		return fpu->state.fxsave.cwd;
+	} else {
+		return (unsigned short)fpu->state.fsave.cwd;
+	}
+}
+
+static inline unsigned short get_fpu_swd(struct fpu *fpu)
+{
+	if (cpu_has_fxsr) {
+		return fpu->state.fxsave.swd;
+	} else {
+		return (unsigned short)fpu->state.fsave.swd;
+	}
+}
+
+static inline unsigned short get_fpu_mxcsr(struct fpu *fpu)
+{
+	if (cpu_has_xmm) {
+		return fpu->state.fxsave.mxcsr;
+	} else {
+		return MXCSR_DEFAULT;
+	}
+}
+
+int fpu__exception_code(struct fpu *fpu, int trap_nr)
+{
+	int err;
+
+	if (trap_nr == X86_TRAP_MF) {
+		unsigned short cwd, swd;
+		/*
+		 * (~cwd & swd) will mask out exceptions that are not set to unmasked
+		 * status.  0x3f is the exception bits in these regs, 0x200 is the
+		 * C1 reg you need in case of a stack fault, 0x040 is the stack
+		 * fault bit.  We should only be taking one exception at a time,
+		 * so if this combination doesn't produce any single exception,
+		 * then we have a bad program that isn't synchronizing its FPU usage
+		 * and it will suffer the consequences since we won't be able to
+		 * fully reproduce the context of the exception
+		 */
+		cwd = get_fpu_cwd(fpu);
+		swd = get_fpu_swd(fpu);
+
+		err = swd & ~cwd;
+	} else {
+		/*
+		 * The SIMD FPU exceptions are handled a little differently, as there
+		 * is only a single status/control register.  Thus, to determine which
+		 * unmasked exception was caught we must mask the exception mask bits
+		 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+		 */
+		unsigned short mxcsr = get_fpu_mxcsr(fpu);
+		err = ~(mxcsr >> 7) & mxcsr;
+	}
+
+	if (err & 0x001) {	/* Invalid op */
+		/*
+		 * swd & 0x240 == 0x040: Stack Underflow
+		 * swd & 0x240 == 0x240: Stack Overflow
+		 * User must clear the SF bit (0x40) if set
+		 */
+		return FPE_FLTINV;
+	} else if (err & 0x004) { /* Divide by Zero */
+		return FPE_FLTDIV;
+	} else if (err & 0x008) { /* Overflow */
+		return FPE_FLTOVF;
+	} else if (err & 0x012) { /* Denormal, Underflow */
+		return FPE_FLTUND;
+	} else if (err & 0x020) { /* Precision */
+		return FPE_FLTRES;
+	}
+
+	/*
+	 * If we're using IRQ 13, or supposedly even some trap
+	 * X86_TRAP_MF implementations, it's possible
+	 * we get a spurious trap, which is not an error.
+	 */
+	return 0;
+}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
new file mode 100644
index 000000000000..fc878fee6a51
--- /dev/null
+++ b/arch/x86/kernel/fpu/init.c
@@ -0,0 +1,354 @@
+/*
+ * x86 FPU boot time init code:
+ */
+#include <asm/fpu/internal.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Initialize the TS bit in CR0 according to the style of context-switches
+ * we are using:
+ */
+static void fpu__init_cpu_ctx_switch(void)
+{
+	if (!cpu_has_eager_fpu)
+		stts();
+	else
+		clts();
+}
+
+/*
+ * Initialize the registers found in all CPUs, CR0 and CR4:
+ */
+static void fpu__init_cpu_generic(void)
+{
+	unsigned long cr0;
+	unsigned long cr4_mask = 0;
+
+	if (cpu_has_fxsr)
+		cr4_mask |= X86_CR4_OSFXSR;
+	if (cpu_has_xmm)
+		cr4_mask |= X86_CR4_OSXMMEXCPT;
+	if (cr4_mask)
+		cr4_set_bits(cr4_mask);
+
+	cr0 = read_cr0();
+	cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
+	if (!cpu_has_fpu)
+		cr0 |= X86_CR0_EM;
+	write_cr0(cr0);
+
+	/* Flush out any pending x87 state: */
+	asm volatile ("fninit");
+}
+
+/*
+ * Enable all supported FPU features. Called when a CPU is brought online:
+ */
+void fpu__init_cpu(void)
+{
+	fpu__init_cpu_generic();
+	fpu__init_cpu_xstate();
+	fpu__init_cpu_ctx_switch();
+}
+
+/*
+ * The earliest FPU detection code.
+ *
+ * Set the X86_FEATURE_FPU CPU-capability bit based on
+ * trying to execute an actual sequence of FPU instructions:
+ */
+static void fpu__init_system_early_generic(struct cpuinfo_x86 *c)
+{
+	unsigned long cr0;
+	u16 fsw, fcw;
+
+	fsw = fcw = 0xffff;
+
+	cr0 = read_cr0();
+	cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
+	write_cr0(cr0);
+
+	asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
+		     : "+m" (fsw), "+m" (fcw));
+
+	if (fsw == 0 && (fcw & 0x103f) == 0x003f)
+		set_cpu_cap(c, X86_FEATURE_FPU);
+	else
+		clear_cpu_cap(c, X86_FEATURE_FPU);
+
+#ifndef CONFIG_MATH_EMULATION
+	if (!cpu_has_fpu) {
+		pr_emerg("x86/fpu: Giving up, no FPU found and no math emulation present\n");
+		for (;;)
+			asm volatile("hlt");
+	}
+#endif
+}
+
+/*
+ * Boot time FPU feature detection code:
+ */
+unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
+
+static void __init fpu__init_system_mxcsr(void)
+{
+	unsigned int mask = 0;
+
+	if (cpu_has_fxsr) {
+		struct fxregs_state fx_tmp __aligned(32) = { };
+
+		asm volatile("fxsave %0" : "+m" (fx_tmp));
+
+		mask = fx_tmp.mxcsr_mask;
+
+		/*
+		 * If zero then use the default features mask,
+		 * which has all features set, except the
+		 * denormals-are-zero feature bit:
+		 */
+		if (mask == 0)
+			mask = 0x0000ffbf;
+	}
+	mxcsr_feature_mask &= mask;
+}
+
+/*
+ * Once per bootup FPU initialization sequences that will run on most x86 CPUs:
+ */
+static void __init fpu__init_system_generic(void)
+{
+	/*
+	 * Set up the legacy init FPU context. (xstate init might overwrite this
+	 * with a more modern format, if the CPU supports it.)
+	 */
+	fpstate_init_fxstate(&init_fpstate.fxsave);
+
+	fpu__init_system_mxcsr();
+}
+
+/*
+ * Size of the FPU context state. All tasks in the system use the
+ * same context size, regardless of what portion they use.
+ * This is inherent to the XSAVE architecture which puts all state
+ * components into a single, continuous memory block:
+ */
+unsigned int xstate_size;
+EXPORT_SYMBOL_GPL(xstate_size);
+
+/*
+ * Set up the xstate_size based on the legacy FPU context size.
+ *
+ * We set this up first, and later it will be overwritten by
+ * fpu__init_system_xstate() if the CPU knows about xstates.
+ */
+static void __init fpu__init_system_xstate_size_legacy(void)
+{
+	static int on_boot_cpu = 1;
+
+	WARN_ON_FPU(!on_boot_cpu);
+	on_boot_cpu = 0;
+
+	/*
+	 * Note that xstate_size might be overwriten later during
+	 * fpu__init_system_xstate().
+	 */
+
+	if (!cpu_has_fpu) {
+		/*
+		 * Disable xsave as we do not support it if i387
+		 * emulation is enabled.
+		 */
+		setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+		setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+		xstate_size = sizeof(struct swregs_state);
+	} else {
+		if (cpu_has_fxsr)
+			xstate_size = sizeof(struct fxregs_state);
+		else
+			xstate_size = sizeof(struct fregs_state);
+	}
+	/*
+	 * Quirk: we don't yet handle the XSAVES* instructions
+	 * correctly, as we don't correctly convert between
+	 * standard and compacted format when interfacing
+	 * with user-space - so disable it for now.
+	 *
+	 * The difference is small: with recent CPUs the
+	 * compacted format is only marginally smaller than
+	 * the standard FPU state format.
+	 *
+	 * ( This is easy to backport while we are fixing
+	 *   XSAVES* support. )
+	 */
+	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+}
+
+/*
+ * FPU context switching strategies:
+ *
+ * Against popular belief, we don't do lazy FPU saves, due to the
+ * task migration complications it brings on SMP - we only do
+ * lazy FPU restores.
+ *
+ * 'lazy' is the traditional strategy, which is based on setting
+ * CR0::TS to 1 during context-switch (instead of doing a full
+ * restore of the FPU state), which causes the first FPU instruction
+ * after the context switch (whenever it is executed) to fault - at
+ * which point we lazily restore the FPU state into FPU registers.
+ *
+ * Tasks are of course under no obligation to execute FPU instructions,
+ * so it can easily happen that another context-switch occurs without
+ * a single FPU instruction being executed. If we eventually switch
+ * back to the original task (that still owns the FPU) then we have
+ * not only saved the restores along the way, but we also have the
+ * FPU ready to be used for the original task.
+ *
+ * 'eager' switching is used on modern CPUs, there we switch the FPU
+ * state during every context switch, regardless of whether the task
+ * has used FPU instructions in that time slice or not. This is done
+ * because modern FPU context saving instructions are able to optimize
+ * state saving and restoration in hardware: they can detect both
+ * unused and untouched FPU state and optimize accordingly.
+ *
+ * [ Note that even in 'lazy' mode we might optimize context switches
+ *   to use 'eager' restores, if we detect that a task is using the FPU
+ *   frequently. See the fpu->counter logic in fpu/internal.h for that. ]
+ */
+static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
+
+static int __init eager_fpu_setup(char *s)
+{
+	if (!strcmp(s, "on"))
+		eagerfpu = ENABLE;
+	else if (!strcmp(s, "off"))
+		eagerfpu = DISABLE;
+	else if (!strcmp(s, "auto"))
+		eagerfpu = AUTO;
+	return 1;
+}
+__setup("eagerfpu=", eager_fpu_setup);
+
+/*
+ * Pick the FPU context switching strategy:
+ */
+static void __init fpu__init_system_ctx_switch(void)
+{
+	static bool on_boot_cpu = 1;
+
+	WARN_ON_FPU(!on_boot_cpu);
+	on_boot_cpu = 0;
+
+	WARN_ON_FPU(current->thread.fpu.fpstate_active);
+	current_thread_info()->status = 0;
+
+	/* Auto enable eagerfpu for xsaveopt */
+	if (cpu_has_xsaveopt && eagerfpu != DISABLE)
+		eagerfpu = ENABLE;
+
+	if (xfeatures_mask & XSTATE_EAGER) {
+		if (eagerfpu == DISABLE) {
+			pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
+			       xfeatures_mask & XSTATE_EAGER);
+			xfeatures_mask &= ~XSTATE_EAGER;
+		} else {
+			eagerfpu = ENABLE;
+		}
+	}
+
+	if (eagerfpu == ENABLE)
+		setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
+
+	printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy");
+}
+
+/*
+ * Called on the boot CPU once per system bootup, to set up the initial
+ * FPU state that is later cloned into all processes:
+ */
+void __init fpu__init_system(struct cpuinfo_x86 *c)
+{
+	fpu__init_system_early_generic(c);
+
+	/*
+	 * The FPU has to be operational for some of the
+	 * later FPU init activities:
+	 */
+	fpu__init_cpu();
+
+	/*
+	 * But don't leave CR0::TS set yet, as some of the FPU setup
+	 * methods depend on being able to execute FPU instructions
+	 * that will fault on a set TS, such as the FXSAVE in
+	 * fpu__init_system_mxcsr().
+	 */
+	clts();
+
+	fpu__init_system_generic();
+	fpu__init_system_xstate_size_legacy();
+	fpu__init_system_xstate();
+
+	fpu__init_system_ctx_switch();
+}
+
+/*
+ * Boot parameter to turn off FPU support and fall back to math-emu:
+ */
+static int __init no_387(char *s)
+{
+	setup_clear_cpu_cap(X86_FEATURE_FPU);
+	return 1;
+}
+__setup("no387", no_387);
+
+/*
+ * Disable all xstate CPU features:
+ */
+static int __init x86_noxsave_setup(char *s)
+{
+	if (strlen(s))
+		return 0;
+
+	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+	setup_clear_cpu_cap(X86_FEATURE_AVX);
+	setup_clear_cpu_cap(X86_FEATURE_AVX2);
+
+	return 1;
+}
+__setup("noxsave", x86_noxsave_setup);
+
+/*
+ * Disable the XSAVEOPT instruction specifically:
+ */
+static int __init x86_noxsaveopt_setup(char *s)
+{
+	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+	return 1;
+}
+__setup("noxsaveopt", x86_noxsaveopt_setup);
+
+/*
+ * Disable the XSAVES instruction:
+ */
+static int __init x86_noxsaves_setup(char *s)
+{
+	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+	return 1;
+}
+__setup("noxsaves", x86_noxsaves_setup);
+
+/*
+ * Disable FX save/restore and SSE support:
+ */
+static int __init x86_nofxsr_setup(char *s)
+{
+	setup_clear_cpu_cap(X86_FEATURE_FXSR);
+	setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
+	setup_clear_cpu_cap(X86_FEATURE_XMM);
+
+	return 1;
+}
+__setup("nofxsr", x86_nofxsr_setup);
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
new file mode 100644
index 000000000000..dc60810c1c74
--- /dev/null
+++ b/arch/x86/kernel/fpu/regset.c
@@ -0,0 +1,356 @@
+/*
+ * FPU register's regset abstraction, for ptrace, core dumps, etc.
+ */
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+
+/*
+ * The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
+ * as the "regset->n" for the xstate regset will be updated based on the feature
+ * capabilites supported by the xsave.
+ */
+int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+	struct fpu *target_fpu = &target->thread.fpu;
+
+	return target_fpu->fpstate_active ? regset->n : 0;
+}
+
+int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
+{
+	struct fpu *target_fpu = &target->thread.fpu;
+
+	return (cpu_has_fxsr && target_fpu->fpstate_active) ? regset->n : 0;
+}
+
+int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
+		unsigned int pos, unsigned int count,
+		void *kbuf, void __user *ubuf)
+{
+	struct fpu *fpu = &target->thread.fpu;
+
+	if (!cpu_has_fxsr)
+		return -ENODEV;
+
+	fpu__activate_fpstate_read(fpu);
+	fpstate_sanitize_xstate(fpu);
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   &fpu->state.fxsave, 0, -1);
+}
+
+int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
+		unsigned int pos, unsigned int count,
+		const void *kbuf, const void __user *ubuf)
+{
+	struct fpu *fpu = &target->thread.fpu;
+	int ret;
+
+	if (!cpu_has_fxsr)
+		return -ENODEV;
+
+	fpu__activate_fpstate_write(fpu);
+	fpstate_sanitize_xstate(fpu);
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &fpu->state.fxsave, 0, -1);
+
+	/*
+	 * mxcsr reserved bits must be masked to zero for security reasons.
+	 */
+	fpu->state.fxsave.mxcsr &= mxcsr_feature_mask;
+
+	/*
+	 * update the header bits in the xsave header, indicating the
+	 * presence of FP and SSE state.
+	 */
+	if (cpu_has_xsave)
+		fpu->state.xsave.header.xfeatures |= XSTATE_FPSSE;
+
+	return ret;
+}
+
+int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
+		unsigned int pos, unsigned int count,
+		void *kbuf, void __user *ubuf)
+{
+	struct fpu *fpu = &target->thread.fpu;
+	struct xregs_state *xsave;
+	int ret;
+
+	if (!cpu_has_xsave)
+		return -ENODEV;
+
+	fpu__activate_fpstate_read(fpu);
+
+	xsave = &fpu->state.xsave;
+
+	/*
+	 * Copy the 48bytes defined by the software first into the xstate
+	 * memory layout in the thread struct, so that we can copy the entire
+	 * xstateregs to the user using one user_regset_copyout().
+	 */
+	memcpy(&xsave->i387.sw_reserved,
+		xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
+	/*
+	 * Copy the xstate memory layout.
+	 */
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+	return ret;
+}
+
+int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
+		  unsigned int pos, unsigned int count,
+		  const void *kbuf, const void __user *ubuf)
+{
+	struct fpu *fpu = &target->thread.fpu;
+	struct xregs_state *xsave;
+	int ret;
+
+	if (!cpu_has_xsave)
+		return -ENODEV;
+
+	fpu__activate_fpstate_write(fpu);
+
+	xsave = &fpu->state.xsave;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
+	/*
+	 * mxcsr reserved bits must be masked to zero for security reasons.
+	 */
+	xsave->i387.mxcsr &= mxcsr_feature_mask;
+	xsave->header.xfeatures &= xfeatures_mask;
+	/*
+	 * These bits must be zero.
+	 */
+	memset(&xsave->header.reserved, 0, 48);
+
+	return ret;
+}
+
+#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
+
+/*
+ * FPU tag word conversions.
+ */
+
+static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
+{
+	unsigned int tmp; /* to avoid 16 bit prefixes in the code */
+
+	/* Transform each pair of bits into 01 (valid) or 00 (empty) */
+	tmp = ~twd;
+	tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
+	/* and move the valid bits to the lower byte. */
+	tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
+	tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
+	tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
+
+	return tmp;
+}
+
+#define FPREG_ADDR(f, n)	((void *)&(f)->st_space + (n) * 16)
+#define FP_EXP_TAG_VALID	0
+#define FP_EXP_TAG_ZERO		1
+#define FP_EXP_TAG_SPECIAL	2
+#define FP_EXP_TAG_EMPTY	3
+
+static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave)
+{
+	struct _fpxreg *st;
+	u32 tos = (fxsave->swd >> 11) & 7;
+	u32 twd = (unsigned long) fxsave->twd;
+	u32 tag;
+	u32 ret = 0xffff0000u;
+	int i;
+
+	for (i = 0; i < 8; i++, twd >>= 1) {
+		if (twd & 0x1) {
+			st = FPREG_ADDR(fxsave, (i - tos) & 7);
+
+			switch (st->exponent & 0x7fff) {
+			case 0x7fff:
+				tag = FP_EXP_TAG_SPECIAL;
+				break;
+			case 0x0000:
+				if (!st->significand[0] &&
+				    !st->significand[1] &&
+				    !st->significand[2] &&
+				    !st->significand[3])
+					tag = FP_EXP_TAG_ZERO;
+				else
+					tag = FP_EXP_TAG_SPECIAL;
+				break;
+			default:
+				if (st->significand[3] & 0x8000)
+					tag = FP_EXP_TAG_VALID;
+				else
+					tag = FP_EXP_TAG_SPECIAL;
+				break;
+			}
+		} else {
+			tag = FP_EXP_TAG_EMPTY;
+		}
+		ret |= tag << (2 * i);
+	}
+	return ret;
+}
+
+/*
+ * FXSR floating point environment conversions.
+ */
+
+void
+convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
+{
+	struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave;
+	struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
+	struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
+	int i;
+
+	env->cwd = fxsave->cwd | 0xffff0000u;
+	env->swd = fxsave->swd | 0xffff0000u;
+	env->twd = twd_fxsr_to_i387(fxsave);
+
+#ifdef CONFIG_X86_64
+	env->fip = fxsave->rip;
+	env->foo = fxsave->rdp;
+	/*
+	 * should be actually ds/cs at fpu exception time, but
+	 * that information is not available in 64bit mode.
+	 */
+	env->fcs = task_pt_regs(tsk)->cs;
+	if (tsk == current) {
+		savesegment(ds, env->fos);
+	} else {
+		env->fos = tsk->thread.ds;
+	}
+	env->fos |= 0xffff0000;
+#else
+	env->fip = fxsave->fip;
+	env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
+	env->foo = fxsave->foo;
+	env->fos = fxsave->fos;
+#endif
+
+	for (i = 0; i < 8; ++i)
+		memcpy(&to[i], &from[i], sizeof(to[0]));
+}
+
+void convert_to_fxsr(struct task_struct *tsk,
+		     const struct user_i387_ia32_struct *env)
+
+{
+	struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave;
+	struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
+	struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
+	int i;
+
+	fxsave->cwd = env->cwd;
+	fxsave->swd = env->swd;
+	fxsave->twd = twd_i387_to_fxsr(env->twd);
+	fxsave->fop = (u16) ((u32) env->fcs >> 16);
+#ifdef CONFIG_X86_64
+	fxsave->rip = env->fip;
+	fxsave->rdp = env->foo;
+	/* cs and ds ignored */
+#else
+	fxsave->fip = env->fip;
+	fxsave->fcs = (env->fcs & 0xffff);
+	fxsave->foo = env->foo;
+	fxsave->fos = env->fos;
+#endif
+
+	for (i = 0; i < 8; ++i)
+		memcpy(&to[i], &from[i], sizeof(from[0]));
+}
+
+int fpregs_get(struct task_struct *target, const struct user_regset *regset,
+	       unsigned int pos, unsigned int count,
+	       void *kbuf, void __user *ubuf)
+{
+	struct fpu *fpu = &target->thread.fpu;
+	struct user_i387_ia32_struct env;
+
+	fpu__activate_fpstate_read(fpu);
+
+	if (!static_cpu_has(X86_FEATURE_FPU))
+		return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
+
+	if (!cpu_has_fxsr)
+		return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+					   &fpu->state.fsave, 0,
+					   -1);
+
+	fpstate_sanitize_xstate(fpu);
+
+	if (kbuf && pos == 0 && count == sizeof(env)) {
+		convert_from_fxsr(kbuf, target);
+		return 0;
+	}
+
+	convert_from_fxsr(&env, target);
+
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+}
+
+int fpregs_set(struct task_struct *target, const struct user_regset *regset,
+	       unsigned int pos, unsigned int count,
+	       const void *kbuf, const void __user *ubuf)
+{
+	struct fpu *fpu = &target->thread.fpu;
+	struct user_i387_ia32_struct env;
+	int ret;
+
+	fpu__activate_fpstate_write(fpu);
+	fpstate_sanitize_xstate(fpu);
+
+	if (!static_cpu_has(X86_FEATURE_FPU))
+		return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
+
+	if (!cpu_has_fxsr)
+		return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+					  &fpu->state.fsave, 0,
+					  -1);
+
+	if (pos > 0 || count < sizeof(env))
+		convert_from_fxsr(&env, target);
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
+	if (!ret)
+		convert_to_fxsr(target, &env);
+
+	/*
+	 * update the header bit in the xsave header, indicating the
+	 * presence of FP.
+	 */
+	if (cpu_has_xsave)
+		fpu->state.xsave.header.xfeatures |= XSTATE_FP;
+	return ret;
+}
+
+/*
+ * FPU state for core dumps.
+ * This is only used for a.out dumps now.
+ * It is declared generically using elf_fpregset_t (which is
+ * struct user_i387_struct) but is in fact only used for 32-bit
+ * dumps, so on 64-bit it is really struct user_i387_ia32_struct.
+ */
+int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu)
+{
+	struct task_struct *tsk = current;
+	struct fpu *fpu = &tsk->thread.fpu;
+	int fpvalid;
+
+	fpvalid = fpu->fpstate_active;
+	if (fpvalid)
+		fpvalid = !fpregs_get(tsk, NULL,
+				      0, sizeof(struct user_i387_ia32_struct),
+				      ufpu, NULL);
+
+	return fpvalid;
+}
+EXPORT_SYMBOL(dump_fpu);
+
+#endif	/* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
new file mode 100644
index 000000000000..50ec9af1bd51
--- /dev/null
+++ b/arch/x86/kernel/fpu/signal.c
@@ -0,0 +1,404 @@
+/*
+ * FPU signal frame handling routines.
+ */
+
+#include <linux/compat.h>
+#include <linux/cpu.h>
+
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+
+#include <asm/sigframe.h>
+
+static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
+
+/*
+ * Check for the presence of extended state information in the
+ * user fpstate pointer in the sigcontext.
+ */
+static inline int check_for_xstate(struct fxregs_state __user *buf,
+				   void __user *fpstate,
+				   struct _fpx_sw_bytes *fx_sw)
+{
+	int min_xstate_size = sizeof(struct fxregs_state) +
+			      sizeof(struct xstate_header);
+	unsigned int magic2;
+
+	if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw)))
+		return -1;
+
+	/* Check for the first magic field and other error scenarios. */
+	if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
+	    fx_sw->xstate_size < min_xstate_size ||
+	    fx_sw->xstate_size > xstate_size ||
+	    fx_sw->xstate_size > fx_sw->extended_size)
+		return -1;
+
+	/*
+	 * Check for the presence of second magic word at the end of memory
+	 * layout. This detects the case where the user just copied the legacy
+	 * fpstate layout with out copying the extended state information
+	 * in the memory layout.
+	 */
+	if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))
+	    || magic2 != FP_XSTATE_MAGIC2)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Signal frame handlers.
+ */
+static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
+{
+	if (use_fxsr()) {
+		struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+		struct user_i387_ia32_struct env;
+		struct _fpstate_ia32 __user *fp = buf;
+
+		convert_from_fxsr(&env, tsk);
+
+		if (__copy_to_user(buf, &env, sizeof(env)) ||
+		    __put_user(xsave->i387.swd, &fp->status) ||
+		    __put_user(X86_FXSR_MAGIC, &fp->magic))
+			return -1;
+	} else {
+		struct fregs_state __user *fp = buf;
+		u32 swd;
+		if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
+			return -1;
+	}
+
+	return 0;
+}
+
+static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
+{
+	struct xregs_state __user *x = buf;
+	struct _fpx_sw_bytes *sw_bytes;
+	u32 xfeatures;
+	int err;
+
+	/* Setup the bytes not touched by the [f]xsave and reserved for SW. */
+	sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
+	err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
+
+	if (!use_xsave())
+		return err;
+
+	err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size));
+
+	/*
+	 * Read the xfeatures which we copied (directly from the cpu or
+	 * from the state in task struct) to the user buffers.
+	 */
+	err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures);
+
+	/*
+	 * For legacy compatible, we always set FP/SSE bits in the bit
+	 * vector while saving the state to the user context. This will
+	 * enable us capturing any changes(during sigreturn) to
+	 * the FP/SSE bits by the legacy applications which don't touch
+	 * xfeatures in the xsave header.
+	 *
+	 * xsave aware apps can change the xfeatures in the xsave
+	 * header as well as change any contents in the memory layout.
+	 * xrestore as part of sigreturn will capture all the changes.
+	 */
+	xfeatures |= XSTATE_FPSSE;
+
+	err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures);
+
+	return err;
+}
+
+static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
+{
+	int err;
+
+	if (use_xsave())
+		err = copy_xregs_to_user(buf);
+	else if (use_fxsr())
+		err = copy_fxregs_to_user((struct fxregs_state __user *) buf);
+	else
+		err = copy_fregs_to_user((struct fregs_state __user *) buf);
+
+	if (unlikely(err) && __clear_user(buf, xstate_size))
+		err = -EFAULT;
+	return err;
+}
+
+/*
+ * Save the fpu, extended register state to the user signal frame.
+ *
+ * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
+ *  state is copied.
+ *  'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
+ *
+ *	buf == buf_fx for 64-bit frames and 32-bit fsave frame.
+ *	buf != buf_fx for 32-bit frames with fxstate.
+ *
+ * If the fpu, extended register state is live, save the state directly
+ * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise,
+ * copy the thread's fpu state to the user frame starting at 'buf_fx'.
+ *
+ * If this is a 32-bit frame with fxstate, put a fsave header before
+ * the aligned state at 'buf_fx'.
+ *
+ * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
+ * indicating the absence/presence of the extended state to the user.
+ */
+int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
+{
+	struct xregs_state *xsave = &current->thread.fpu.state.xsave;
+	struct task_struct *tsk = current;
+	int ia32_fxstate = (buf != buf_fx);
+
+	ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
+			 config_enabled(CONFIG_IA32_EMULATION));
+
+	if (!access_ok(VERIFY_WRITE, buf, size))
+		return -EACCES;
+
+	if (!static_cpu_has(X86_FEATURE_FPU))
+		return fpregs_soft_get(current, NULL, 0,
+			sizeof(struct user_i387_ia32_struct), NULL,
+			(struct _fpstate_ia32 __user *) buf) ? -1 : 1;
+
+	if (fpregs_active()) {
+		/* Save the live register state to the user directly. */
+		if (copy_fpregs_to_sigframe(buf_fx))
+			return -1;
+		/* Update the thread's fxstate to save the fsave header. */
+		if (ia32_fxstate)
+			copy_fxregs_to_kernel(&tsk->thread.fpu);
+	} else {
+		fpstate_sanitize_xstate(&tsk->thread.fpu);
+		if (__copy_to_user(buf_fx, xsave, xstate_size))
+			return -1;
+	}
+
+	/* Save the fsave header for the 32-bit frames. */
+	if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
+		return -1;
+
+	if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
+		return -1;
+
+	return 0;
+}
+
+static inline void
+sanitize_restored_xstate(struct task_struct *tsk,
+			 struct user_i387_ia32_struct *ia32_env,
+			 u64 xfeatures, int fx_only)
+{
+	struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+	struct xstate_header *header = &xsave->header;
+
+	if (use_xsave()) {
+		/* These bits must be zero. */
+		memset(header->reserved, 0, 48);
+
+		/*
+		 * Init the state that is not present in the memory
+		 * layout and not enabled by the OS.
+		 */
+		if (fx_only)
+			header->xfeatures = XSTATE_FPSSE;
+		else
+			header->xfeatures &= (xfeatures_mask & xfeatures);
+	}
+
+	if (use_fxsr()) {
+		/*
+		 * mscsr reserved bits must be masked to zero for security
+		 * reasons.
+		 */
+		xsave->i387.mxcsr &= mxcsr_feature_mask;
+
+		convert_to_fxsr(tsk, ia32_env);
+	}
+}
+
+/*
+ * Restore the extended state if present. Otherwise, restore the FP/SSE state.
+ */
+static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only)
+{
+	if (use_xsave()) {
+		if ((unsigned long)buf % 64 || fx_only) {
+			u64 init_bv = xfeatures_mask & ~XSTATE_FPSSE;
+			copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+			return copy_user_to_fxregs(buf);
+		} else {
+			u64 init_bv = xfeatures_mask & ~xbv;
+			if (unlikely(init_bv))
+				copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
+			return copy_user_to_xregs(buf, xbv);
+		}
+	} else if (use_fxsr()) {
+		return copy_user_to_fxregs(buf);
+	} else
+		return copy_user_to_fregs(buf);
+}
+
+static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
+{
+	int ia32_fxstate = (buf != buf_fx);
+	struct task_struct *tsk = current;
+	struct fpu *fpu = &tsk->thread.fpu;
+	int state_size = xstate_size;
+	u64 xfeatures = 0;
+	int fx_only = 0;
+
+	ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
+			 config_enabled(CONFIG_IA32_EMULATION));
+
+	if (!buf) {
+		fpu__clear(fpu);
+		return 0;
+	}
+
+	if (!access_ok(VERIFY_READ, buf, size))
+		return -EACCES;
+
+	fpu__activate_curr(fpu);
+
+	if (!static_cpu_has(X86_FEATURE_FPU))
+		return fpregs_soft_set(current, NULL,
+				       0, sizeof(struct user_i387_ia32_struct),
+				       NULL, buf) != 0;
+
+	if (use_xsave()) {
+		struct _fpx_sw_bytes fx_sw_user;
+		if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) {
+			/*
+			 * Couldn't find the extended state information in the
+			 * memory layout. Restore just the FP/SSE and init all
+			 * the other extended state.
+			 */
+			state_size = sizeof(struct fxregs_state);
+			fx_only = 1;
+		} else {
+			state_size = fx_sw_user.xstate_size;
+			xfeatures = fx_sw_user.xfeatures;
+		}
+	}
+
+	if (ia32_fxstate) {
+		/*
+		 * For 32-bit frames with fxstate, copy the user state to the
+		 * thread's fpu state, reconstruct fxstate from the fsave
+		 * header. Sanitize the copied state etc.
+		 */
+		struct fpu *fpu = &tsk->thread.fpu;
+		struct user_i387_ia32_struct env;
+		int err = 0;
+
+		/*
+		 * Drop the current fpu which clears fpu->fpstate_active. This ensures
+		 * that any context-switch during the copy of the new state,
+		 * avoids the intermediate state from getting restored/saved.
+		 * Thus avoiding the new restored state from getting corrupted.
+		 * We will be ready to restore/save the state only after
+		 * fpu->fpstate_active is again set.
+		 */
+		fpu__drop(fpu);
+
+		if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) ||
+		    __copy_from_user(&env, buf, sizeof(env))) {
+			fpstate_init(&fpu->state);
+			err = -1;
+		} else {
+			sanitize_restored_xstate(tsk, &env, xfeatures, fx_only);
+		}
+
+		fpu->fpstate_active = 1;
+		if (use_eager_fpu()) {
+			preempt_disable();
+			fpu__restore(fpu);
+			preempt_enable();
+		}
+
+		return err;
+	} else {
+		/*
+		 * For 64-bit frames and 32-bit fsave frames, restore the user
+		 * state to the registers directly (with exceptions handled).
+		 */
+		user_fpu_begin();
+		if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) {
+			fpu__clear(fpu);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+static inline int xstate_sigframe_size(void)
+{
+	return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size;
+}
+
+/*
+ * Restore FPU state from a sigframe:
+ */
+int fpu__restore_sig(void __user *buf, int ia32_frame)
+{
+	void __user *buf_fx = buf;
+	int size = xstate_sigframe_size();
+
+	if (ia32_frame && use_fxsr()) {
+		buf_fx = buf + sizeof(struct fregs_state);
+		size += sizeof(struct fregs_state);
+	}
+
+	return __fpu__restore_sig(buf, buf_fx, size);
+}
+
+unsigned long
+fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
+		     unsigned long *buf_fx, unsigned long *size)
+{
+	unsigned long frame_size = xstate_sigframe_size();
+
+	*buf_fx = sp = round_down(sp - frame_size, 64);
+	if (ia32_frame && use_fxsr()) {
+		frame_size += sizeof(struct fregs_state);
+		sp -= sizeof(struct fregs_state);
+	}
+
+	*size = frame_size;
+
+	return sp;
+}
+/*
+ * Prepare the SW reserved portion of the fxsave memory layout, indicating
+ * the presence of the extended state information in the memory layout
+ * pointed by the fpstate pointer in the sigcontext.
+ * This will be saved when ever the FP and extended state context is
+ * saved on the user stack during the signal handler delivery to the user.
+ */
+void fpu__init_prepare_fx_sw_frame(void)
+{
+	int fsave_header_size = sizeof(struct fregs_state);
+	int size = xstate_size + FP_XSTATE_MAGIC2_SIZE;
+
+	if (config_enabled(CONFIG_X86_32))
+		size += fsave_header_size;
+
+	fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
+	fx_sw_reserved.extended_size = size;
+	fx_sw_reserved.xfeatures = xfeatures_mask;
+	fx_sw_reserved.xstate_size = xstate_size;
+
+	if (config_enabled(CONFIG_IA32_EMULATION)) {
+		fx_sw_reserved_ia32 = fx_sw_reserved;
+		fx_sw_reserved_ia32.extended_size += fsave_header_size;
+	}
+}
+
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
new file mode 100644
index 000000000000..62fc001c7846
--- /dev/null
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -0,0 +1,461 @@
+/*
+ * xsave/xrstor support.
+ *
+ * Author: Suresh Siddha <suresh.b.siddha@intel.com>
+ */
+#include <linux/compat.h>
+#include <linux/cpu.h>
+
+#include <asm/fpu/api.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
+
+#include <asm/tlbflush.h>
+
+static const char *xfeature_names[] =
+{
+	"x87 floating point registers"	,
+	"SSE registers"			,
+	"AVX registers"			,
+	"MPX bounds registers"		,
+	"MPX CSR"			,
+	"AVX-512 opmask"		,
+	"AVX-512 Hi256"			,
+	"AVX-512 ZMM_Hi256"		,
+	"unknown xstate feature"	,
+};
+
+/*
+ * Mask of xstate features supported by the CPU and the kernel:
+ */
+u64 xfeatures_mask __read_mostly;
+
+static unsigned int xstate_offsets[XFEATURES_NR_MAX] = { [ 0 ... XFEATURES_NR_MAX - 1] = -1};
+static unsigned int xstate_sizes[XFEATURES_NR_MAX]   = { [ 0 ... XFEATURES_NR_MAX - 1] = -1};
+static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8];
+
+/* The number of supported xfeatures in xfeatures_mask: */
+static unsigned int xfeatures_nr;
+
+/*
+ * Return whether the system supports a given xfeature.
+ *
+ * Also return the name of the (most advanced) feature that the caller requested:
+ */
+int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
+{
+	u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask;
+
+	if (unlikely(feature_name)) {
+		long xfeature_idx, max_idx;
+		u64 xfeatures_print;
+		/*
+		 * So we use FLS here to be able to print the most advanced
+		 * feature that was requested but is missing. So if a driver
+		 * asks about "XSTATE_SSE | XSTATE_YMM" we'll print the
+		 * missing AVX feature - this is the most informative message
+		 * to users:
+		 */
+		if (xfeatures_missing)
+			xfeatures_print = xfeatures_missing;
+		else
+			xfeatures_print = xfeatures_needed;
+
+		xfeature_idx = fls64(xfeatures_print)-1;
+		max_idx = ARRAY_SIZE(xfeature_names)-1;
+		xfeature_idx = min(xfeature_idx, max_idx);
+
+		*feature_name = xfeature_names[xfeature_idx];
+	}
+
+	if (xfeatures_missing)
+		return 0;
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
+
+/*
+ * When executing XSAVEOPT (or other optimized XSAVE instructions), if
+ * a processor implementation detects that an FPU state component is still
+ * (or is again) in its initialized state, it may clear the corresponding
+ * bit in the header.xfeatures field, and can skip the writeout of registers
+ * to the corresponding memory layout.
+ *
+ * This means that when the bit is zero, the state component might still contain
+ * some previous - non-initialized register state.
+ *
+ * Before writing xstate information to user-space we sanitize those components,
+ * to always ensure that the memory layout of a feature will be in the init state
+ * if the corresponding header bit is zero. This is to ensure that user-space doesn't
+ * see some stale state in the memory layout during signal handling, debugging etc.
+ */
+void fpstate_sanitize_xstate(struct fpu *fpu)
+{
+	struct fxregs_state *fx = &fpu->state.fxsave;
+	int feature_bit;
+	u64 xfeatures;
+
+	if (!use_xsaveopt())
+		return;
+
+	xfeatures = fpu->state.xsave.header.xfeatures;
+
+	/*
+	 * None of the feature bits are in init state. So nothing else
+	 * to do for us, as the memory layout is up to date.
+	 */
+	if ((xfeatures & xfeatures_mask) == xfeatures_mask)
+		return;
+
+	/*
+	 * FP is in init state
+	 */
+	if (!(xfeatures & XSTATE_FP)) {
+		fx->cwd = 0x37f;
+		fx->swd = 0;
+		fx->twd = 0;
+		fx->fop = 0;
+		fx->rip = 0;
+		fx->rdp = 0;
+		memset(&fx->st_space[0], 0, 128);
+	}
+
+	/*
+	 * SSE is in init state
+	 */
+	if (!(xfeatures & XSTATE_SSE))
+		memset(&fx->xmm_space[0], 0, 256);
+
+	/*
+	 * First two features are FPU and SSE, which above we handled
+	 * in a special way already:
+	 */
+	feature_bit = 0x2;
+	xfeatures = (xfeatures_mask & ~xfeatures) >> 2;
+
+	/*
+	 * Update all the remaining memory layouts according to their
+	 * standard xstate layout, if their header bit is in the init
+	 * state:
+	 */
+	while (xfeatures) {
+		if (xfeatures & 0x1) {
+			int offset = xstate_offsets[feature_bit];
+			int size = xstate_sizes[feature_bit];
+
+			memcpy((void *)fx + offset,
+			       (void *)&init_fpstate.xsave + offset,
+			       size);
+		}
+
+		xfeatures >>= 1;
+		feature_bit++;
+	}
+}
+
+/*
+ * Enable the extended processor state save/restore feature.
+ * Called once per CPU onlining.
+ */
+void fpu__init_cpu_xstate(void)
+{
+	if (!cpu_has_xsave || !xfeatures_mask)
+		return;
+
+	cr4_set_bits(X86_CR4_OSXSAVE);
+	xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+}
+
+/*
+ * Record the offsets and sizes of various xstates contained
+ * in the XSAVE state memory layout.
+ *
+ * ( Note that certain features might be non-present, for them
+ *   we'll have 0 offset and 0 size. )
+ */
+static void __init setup_xstate_features(void)
+{
+	u32 eax, ebx, ecx, edx, leaf;
+
+	xfeatures_nr = fls64(xfeatures_mask);
+
+	for (leaf = 2; leaf < xfeatures_nr; leaf++) {
+		cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
+
+		xstate_offsets[leaf] = ebx;
+		xstate_sizes[leaf] = eax;
+
+		printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %04x, xstate_sizes[%d]: %04x\n", leaf, ebx, leaf, eax);
+	}
+}
+
+static void __init print_xstate_feature(u64 xstate_mask)
+{
+	const char *feature_name;
+
+	if (cpu_has_xfeatures(xstate_mask, &feature_name))
+		pr_info("x86/fpu: Supporting XSAVE feature 0x%02Lx: '%s'\n", xstate_mask, feature_name);
+}
+
+/*
+ * Print out all the supported xstate features:
+ */
+static void __init print_xstate_features(void)
+{
+	print_xstate_feature(XSTATE_FP);
+	print_xstate_feature(XSTATE_SSE);
+	print_xstate_feature(XSTATE_YMM);
+	print_xstate_feature(XSTATE_BNDREGS);
+	print_xstate_feature(XSTATE_BNDCSR);
+	print_xstate_feature(XSTATE_OPMASK);
+	print_xstate_feature(XSTATE_ZMM_Hi256);
+	print_xstate_feature(XSTATE_Hi16_ZMM);
+}
+
+/*
+ * This function sets up offsets and sizes of all extended states in
+ * xsave area. This supports both standard format and compacted format
+ * of the xsave aread.
+ */
+static void __init setup_xstate_comp(void)
+{
+	unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8];
+	int i;
+
+	/*
+	 * The FP xstates and SSE xstates are legacy states. They are always
+	 * in the fixed offsets in the xsave area in either compacted form
+	 * or standard form.
+	 */
+	xstate_comp_offsets[0] = 0;
+	xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space);
+
+	if (!cpu_has_xsaves) {
+		for (i = 2; i < xfeatures_nr; i++) {
+			if (test_bit(i, (unsigned long *)&xfeatures_mask)) {
+				xstate_comp_offsets[i] = xstate_offsets[i];
+				xstate_comp_sizes[i] = xstate_sizes[i];
+			}
+		}
+		return;
+	}
+
+	xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+
+	for (i = 2; i < xfeatures_nr; i++) {
+		if (test_bit(i, (unsigned long *)&xfeatures_mask))
+			xstate_comp_sizes[i] = xstate_sizes[i];
+		else
+			xstate_comp_sizes[i] = 0;
+
+		if (i > 2)
+			xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
+					+ xstate_comp_sizes[i-1];
+
+	}
+}
+
+/*
+ * setup the xstate image representing the init state
+ */
+static void __init setup_init_fpu_buf(void)
+{
+	static int on_boot_cpu = 1;
+
+	WARN_ON_FPU(!on_boot_cpu);
+	on_boot_cpu = 0;
+
+	if (!cpu_has_xsave)
+		return;
+
+	setup_xstate_features();
+	print_xstate_features();
+
+	if (cpu_has_xsaves) {
+		init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
+		init_fpstate.xsave.header.xfeatures = xfeatures_mask;
+	}
+
+	/*
+	 * Init all the features state with header_bv being 0x0
+	 */
+	copy_kernel_to_xregs_booting(&init_fpstate.xsave);
+
+	/*
+	 * Dump the init state again. This is to identify the init state
+	 * of any feature which is not represented by all zero's.
+	 */
+	copy_xregs_to_kernel_booting(&init_fpstate.xsave);
+}
+
+/*
+ * Calculate total size of enabled xstates in XCR0/xfeatures_mask.
+ */
+static void __init init_xstate_size(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	int i;
+
+	if (!cpu_has_xsaves) {
+		cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+		xstate_size = ebx;
+		return;
+	}
+
+	xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
+	for (i = 2; i < 64; i++) {
+		if (test_bit(i, (unsigned long *)&xfeatures_mask)) {
+			cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+			xstate_size += eax;
+		}
+	}
+}
+
+/*
+ * Enable and initialize the xsave feature.
+ * Called once per system bootup.
+ */
+void __init fpu__init_system_xstate(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	static int on_boot_cpu = 1;
+
+	WARN_ON_FPU(!on_boot_cpu);
+	on_boot_cpu = 0;
+
+	if (!cpu_has_xsave) {
+		pr_info("x86/fpu: Legacy x87 FPU detected.\n");
+		return;
+	}
+
+	if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
+		WARN_ON_FPU(1);
+		return;
+	}
+
+	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+	xfeatures_mask = eax + ((u64)edx << 32);
+
+	if ((xfeatures_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
+		pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
+		BUG();
+	}
+
+	/* Support only the state known to the OS: */
+	xfeatures_mask = xfeatures_mask & XCNTXT_MASK;
+
+	/* Enable xstate instructions to be able to continue with initialization: */
+	fpu__init_cpu_xstate();
+
+	/* Recompute the context size for enabled features: */
+	init_xstate_size();
+
+	update_regset_xstate_info(xstate_size, xfeatures_mask);
+	fpu__init_prepare_fx_sw_frame();
+	setup_init_fpu_buf();
+	setup_xstate_comp();
+
+	pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is 0x%x bytes, using '%s' format.\n",
+		xfeatures_mask,
+		xstate_size,
+		cpu_has_xsaves ? "compacted" : "standard");
+}
+
+/*
+ * Restore minimal FPU state after suspend:
+ */
+void fpu__resume_cpu(void)
+{
+	/*
+	 * Restore XCR0 on xsave capable CPUs:
+	 */
+	if (cpu_has_xsave)
+		xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
+}
+
+/*
+ * Given the xsave area and a state inside, this function returns the
+ * address of the state.
+ *
+ * This is the API that is called to get xstate address in either
+ * standard format or compacted format of xsave area.
+ *
+ * Note that if there is no data for the field in the xsave buffer
+ * this will return NULL.
+ *
+ * Inputs:
+ *	xstate: the thread's storage area for all FPU data
+ *	xstate_feature: state which is defined in xsave.h (e.g.
+ *	XSTATE_FP, XSTATE_SSE, etc...)
+ * Output:
+ *	address of the state in the xsave area, or NULL if the
+ *	field is not present in the xsave buffer.
+ */
+void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
+{
+	int feature_nr = fls64(xstate_feature) - 1;
+	/*
+	 * Do we even *have* xsave state?
+	 */
+	if (!boot_cpu_has(X86_FEATURE_XSAVE))
+		return NULL;
+
+	xsave = &current->thread.fpu.state.xsave;
+	/*
+	 * We should not ever be requesting features that we
+	 * have not enabled.  Remember that pcntxt_mask is
+	 * what we write to the XCR0 register.
+	 */
+	WARN_ONCE(!(xfeatures_mask & xstate_feature),
+		  "get of unsupported state");
+	/*
+	 * This assumes the last 'xsave*' instruction to
+	 * have requested that 'xstate_feature' be saved.
+	 * If it did not, we might be seeing and old value
+	 * of the field in the buffer.
+	 *
+	 * This can happen because the last 'xsave' did not
+	 * request that this feature be saved (unlikely)
+	 * or because the "init optimization" caused it
+	 * to not be saved.
+	 */
+	if (!(xsave->header.xfeatures & xstate_feature))
+		return NULL;
+
+	return (void *)xsave + xstate_comp_offsets[feature_nr];
+}
+EXPORT_SYMBOL_GPL(get_xsave_addr);
+
+/*
+ * This wraps up the common operations that need to occur when retrieving
+ * data from xsave state.  It first ensures that the current task was
+ * using the FPU and retrieves the data in to a buffer.  It then calculates
+ * the offset of the requested field in the buffer.
+ *
+ * This function is safe to call whether the FPU is in use or not.
+ *
+ * Note that this only works on the current task.
+ *
+ * Inputs:
+ *	@xsave_state: state which is defined in xsave.h (e.g. XSTATE_FP,
+ *	XSTATE_SSE, etc...)
+ * Output:
+ *	address of the state in the xsave area or NULL if the state
+ *	is not present or is in its 'init state'.
+ */
+const void *get_xsave_field_ptr(int xsave_state)
+{
+	struct fpu *fpu = &current->thread.fpu;
+
+	if (!fpu->fpstate_active)
+		return NULL;
+	/*
+	 * fpu__save() takes the CPU's xstate registers
+	 * and saves them off to the 'fpu memory buffer.
+	 */
+	fpu__save(fpu);
+
+	return get_xsave_addr(&fpu->state.xsave, xsave_state);
+}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 2b55ee6db053..5a4668136e98 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -167,7 +167,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 	clear_bss();
 
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
-		set_intr_gate(i, early_idt_handlers[i]);
+		set_intr_gate(i, early_idt_handler_array[i]);
 	load_idt((const struct desc_ptr *)&idt_descr);
 
 	copy_bootdata(__va(real_mode_data));
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index d031bad9e07e..0e2d96ffd158 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -62,9 +62,16 @@
 #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
 #endif
 
-/* Number of possible pages in the lowmem region */
-LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT)
-	
+/*
+ * Number of possible pages in the lowmem region.
+ *
+ * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
+ * gas warning about overflowing shift count when gas has been compiled
+ * with only a host target support using a 32-bit type for internal
+ * representation.
+ */
+LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)
+
 /* Enough space to fit pagetables for the low memory linear map */
 MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
 
@@ -478,21 +485,22 @@ is486:
 __INIT
 setup_once:
 	/*
-	 * Set up a idt with 256 entries pointing to ignore_int,
-	 * interrupt gates. It doesn't actually load idt - that needs
-	 * to be done on each CPU. Interrupts are enabled elsewhere,
-	 * when we can be relatively sure everything is ok.
+	 * Set up a idt with 256 interrupt gates that push zero if there
+	 * is no error code and then jump to early_idt_handler_common.
+	 * It doesn't actually load the idt - that needs to be done on
+	 * each CPU. Interrupts are enabled elsewhere, when we can be
+	 * relatively sure everything is ok.
 	 */
 
 	movl $idt_table,%edi
-	movl $early_idt_handlers,%eax
+	movl $early_idt_handler_array,%eax
 	movl $NUM_EXCEPTION_VECTORS,%ecx
 1:
 	movl %eax,(%edi)
 	movl %eax,4(%edi)
 	/* interrupt gate, dpl=0, present */
 	movl $(0x8E000000 + __KERNEL_CS),2(%edi)
-	addl $9,%eax
+	addl $EARLY_IDT_HANDLER_SIZE,%eax
 	addl $8,%edi
 	loop 1b
 
@@ -524,30 +532,32 @@ setup_once:
 	andl $0,setup_once_ref	/* Once is enough, thanks */
 	ret
 
-ENTRY(early_idt_handlers)
+ENTRY(early_idt_handler_array)
 	# 36(%esp) %eflags
 	# 32(%esp) %cs
 	# 28(%esp) %eip
 	# 24(%rsp) error code
 	i = 0
 	.rept NUM_EXCEPTION_VECTORS
-	.if (EXCEPTION_ERRCODE_MASK >> i) & 1
-	ASM_NOP2
-	.else
+	.ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
 	pushl $0		# Dummy error code, to make stack frame uniform
 	.endif
 	pushl $i		# 20(%esp) Vector number
-	jmp early_idt_handler
+	jmp early_idt_handler_common
 	i = i + 1
+	.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
 	.endr
-ENDPROC(early_idt_handlers)
+ENDPROC(early_idt_handler_array)
 	
-	/* This is global to keep gas from relaxing the jumps */
-ENTRY(early_idt_handler)
+early_idt_handler_common:
+	/*
+	 * The stack is the hardware frame, an error code or zero, and the
+	 * vector number.
+	 */
 	cld
 
 	cmpl $2,(%esp)		# X86_TRAP_NMI
-	je is_nmi		# Ignore NMI
+	je .Lis_nmi		# Ignore NMI
 
 	cmpl $2,%ss:early_recursion_flag
 	je hlt_loop
@@ -600,10 +610,10 @@ ex_entry:
 	pop %ecx
 	pop %eax
 	decl %ss:early_recursion_flag
-is_nmi:
+.Lis_nmi:
 	addl $8,%esp		/* drop vector number and error code */
 	iret
-ENDPROC(early_idt_handler)
+ENDPROC(early_idt_handler_common)
 
 /* This is the default interrupt "handler" :-) */
 	ALIGN
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ae6588b301c2..e5c27f729a38 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -321,30 +321,32 @@ bad_address:
 	jmp bad_address
 
 	__INIT
-	.globl early_idt_handlers
-early_idt_handlers:
+ENTRY(early_idt_handler_array)
 	# 104(%rsp) %rflags
 	#  96(%rsp) %cs
 	#  88(%rsp) %rip
 	#  80(%rsp) error code
 	i = 0
 	.rept NUM_EXCEPTION_VECTORS
-	.if (EXCEPTION_ERRCODE_MASK >> i) & 1
-	ASM_NOP2
-	.else
+	.ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
 	pushq $0		# Dummy error code, to make stack frame uniform
 	.endif
 	pushq $i		# 72(%rsp) Vector number
-	jmp early_idt_handler
+	jmp early_idt_handler_common
 	i = i + 1
+	.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
 	.endr
+ENDPROC(early_idt_handler_array)
 
-/* This is global to keep gas from relaxing the jumps */
-ENTRY(early_idt_handler)
+early_idt_handler_common:
+	/*
+	 * The stack is the hardware frame, an error code or zero, and the
+	 * vector number.
+	 */
 	cld
 
 	cmpl $2,(%rsp)		# X86_TRAP_NMI
-	je is_nmi		# Ignore NMI
+	je .Lis_nmi		# Ignore NMI
 
 	cmpl $2,early_recursion_flag(%rip)
 	jz  1f
@@ -409,10 +411,10 @@ ENTRY(early_idt_handler)
 	popq %rcx
 	popq %rax
 	decl early_recursion_flag(%rip)
-is_nmi:
+.Lis_nmi:
 	addq $16,%rsp		# drop vector number and error code
 	INTERRUPT_RETURN
-ENDPROC(early_idt_handler)
+ENDPROC(early_idt_handler_common)
 
 	__INITDATA
 
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 3acbff4716b0..10757d0a3fcf 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -12,6 +12,7 @@
 #include <linux/pm.h>
 #include <linux/io.h>
 
+#include <asm/irqdomain.h>
 #include <asm/fixmap.h>
 #include <asm/hpet.h>
 #include <asm/time.h>
@@ -305,8 +306,6 @@ static void hpet_legacy_clockevent_register(void)
 	printk(KERN_DEBUG "hpet clockevent registered\n");
 }
 
-static int hpet_setup_msi_irq(unsigned int irq);
-
 static void hpet_set_mode(enum clock_event_mode mode,
 			  struct clock_event_device *evt, int timer)
 {
@@ -357,7 +356,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
 			hpet_enable_legacy_int();
 		} else {
 			struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-			hpet_setup_msi_irq(hdev->irq);
+			irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
 			disable_irq(hdev->irq);
 			irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
 			enable_irq(hdev->irq);
@@ -423,6 +422,7 @@ static int hpet_legacy_next_event(unsigned long delta,
 
 static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
 static struct hpet_dev	*hpet_devs;
+static struct irq_domain *hpet_domain;
 
 void hpet_msi_unmask(struct irq_data *data)
 {
@@ -473,31 +473,6 @@ static int hpet_msi_next_event(unsigned long delta,
 	return hpet_next_event(delta, evt, hdev->num);
 }
 
-static int hpet_setup_msi_irq(unsigned int irq)
-{
-	if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) {
-		irq_free_hwirq(irq);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static int hpet_assign_irq(struct hpet_dev *dev)
-{
-	unsigned int irq = irq_alloc_hwirq(-1);
-
-	if (!irq)
-		return -EINVAL;
-
-	irq_set_handler_data(irq, dev);
-
-	if (hpet_setup_msi_irq(irq))
-		return -EINVAL;
-
-	dev->irq = irq;
-	return 0;
-}
-
 static irqreturn_t hpet_interrupt_handler(int irq, void *data)
 {
 	struct hpet_dev *dev = (struct hpet_dev *)data;
@@ -540,9 +515,6 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
 	if (!(hdev->flags & HPET_DEV_VALID))
 		return;
 
-	if (hpet_setup_msi_irq(hdev->irq))
-		return;
-
 	hdev->cpu = cpu;
 	per_cpu(cpu_hpet_dev, cpu) = hdev;
 	evt->name = hdev->name;
@@ -574,7 +546,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
 	unsigned int id;
 	unsigned int num_timers;
 	unsigned int num_timers_used = 0;
-	int i;
+	int i, irq;
 
 	if (hpet_msi_disable)
 		return;
@@ -587,6 +559,10 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
 	num_timers++; /* Value read out starts from 0 */
 	hpet_print_config();
 
+	hpet_domain = hpet_create_irq_domain(hpet_blockid);
+	if (!hpet_domain)
+		return;
+
 	hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
 	if (!hpet_devs)
 		return;
@@ -604,12 +580,14 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
 		hdev->flags = 0;
 		if (cfg & HPET_TN_PERIODIC_CAP)
 			hdev->flags |= HPET_DEV_PERI_CAP;
+		sprintf(hdev->name, "hpet%d", i);
 		hdev->num = i;
 
-		sprintf(hdev->name, "hpet%d", i);
-		if (hpet_assign_irq(hdev))
+		irq = hpet_assign_irq(hpet_domain, hdev, hdev->num);
+		if (irq <= 0)
 			continue;
 
+		hdev->irq = irq;
 		hdev->flags |= HPET_DEV_FSB_CAP;
 		hdev->flags |= HPET_DEV_VALID;
 		num_timers_used++;
@@ -709,10 +687,6 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
 }
 #else
 
-static int hpet_setup_msi_irq(unsigned int irq)
-{
-	return 0;
-}
 static void hpet_msi_capability_lookup(unsigned int start_timer)
 {
 	return;
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 05fd74f537d6..64341aa485ae 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -40,7 +40,5 @@ EXPORT_SYMBOL(empty_zero_page);
 
 #ifdef CONFIG_PREEMPT
 EXPORT_SYMBOL(___preempt_schedule);
-#ifdef CONFIG_CONTEXT_TRACKING
-EXPORT_SYMBOL(___preempt_schedule_context);
-#endif
+EXPORT_SYMBOL(___preempt_schedule_notrace);
 #endif
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
deleted file mode 100644
index 009183276bb7..000000000000
--- a/arch/x86/kernel/i387.c
+++ /dev/null
@@ -1,656 +0,0 @@
-/*
- *  Copyright (C) 1994 Linus Torvalds
- *
- *  Pentium III FXSR, SSE support
- *  General FPU state handling cleanups
- *	Gareth Hughes <gareth@valinux.com>, May 2000
- */
-#include <linux/module.h>
-#include <linux/regset.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-#include <asm/sigcontext.h>
-#include <asm/processor.h>
-#include <asm/math_emu.h>
-#include <asm/tlbflush.h>
-#include <asm/uaccess.h>
-#include <asm/ptrace.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
-#include <asm/user.h>
-
-static DEFINE_PER_CPU(bool, in_kernel_fpu);
-
-void kernel_fpu_disable(void)
-{
-	WARN_ON(this_cpu_read(in_kernel_fpu));
-	this_cpu_write(in_kernel_fpu, true);
-}
-
-void kernel_fpu_enable(void)
-{
-	this_cpu_write(in_kernel_fpu, false);
-}
-
-/*
- * Were we in an interrupt that interrupted kernel mode?
- *
- * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that
- * pair does nothing at all: the thread must not have fpu (so
- * that we don't try to save the FPU state), and TS must
- * be set (so that the clts/stts pair does nothing that is
- * visible in the interrupted kernel thread).
- *
- * Except for the eagerfpu case when we return true; in the likely case
- * the thread has FPU but we are not going to set/clear TS.
- */
-static inline bool interrupted_kernel_fpu_idle(void)
-{
-	if (this_cpu_read(in_kernel_fpu))
-		return false;
-
-	if (use_eager_fpu())
-		return true;
-
-	return !__thread_has_fpu(current) &&
-		(read_cr0() & X86_CR0_TS);
-}
-
-/*
- * Were we in user mode (or vm86 mode) when we were
- * interrupted?
- *
- * Doing kernel_fpu_begin/end() is ok if we are running
- * in an interrupt context from user mode - we'll just
- * save the FPU state as required.
- */
-static inline bool interrupted_user_mode(void)
-{
-	struct pt_regs *regs = get_irq_regs();
-	return regs && user_mode(regs);
-}
-
-/*
- * Can we use the FPU in kernel mode with the
- * whole "kernel_fpu_begin/end()" sequence?
- *
- * It's always ok in process context (ie "not interrupt")
- * but it is sometimes ok even from an irq.
- */
-bool irq_fpu_usable(void)
-{
-	return !in_interrupt() ||
-		interrupted_user_mode() ||
-		interrupted_kernel_fpu_idle();
-}
-EXPORT_SYMBOL(irq_fpu_usable);
-
-void __kernel_fpu_begin(void)
-{
-	struct task_struct *me = current;
-
-	this_cpu_write(in_kernel_fpu, true);
-
-	if (__thread_has_fpu(me)) {
-		__save_init_fpu(me);
-	} else {
-		this_cpu_write(fpu_owner_task, NULL);
-		if (!use_eager_fpu())
-			clts();
-	}
-}
-EXPORT_SYMBOL(__kernel_fpu_begin);
-
-void __kernel_fpu_end(void)
-{
-	struct task_struct *me = current;
-
-	if (__thread_has_fpu(me)) {
-		if (WARN_ON(restore_fpu_checking(me)))
-			fpu_reset_state(me);
-	} else if (!use_eager_fpu()) {
-		stts();
-	}
-
-	this_cpu_write(in_kernel_fpu, false);
-}
-EXPORT_SYMBOL(__kernel_fpu_end);
-
-void unlazy_fpu(struct task_struct *tsk)
-{
-	preempt_disable();
-	if (__thread_has_fpu(tsk)) {
-		if (use_eager_fpu()) {
-			__save_fpu(tsk);
-		} else {
-			__save_init_fpu(tsk);
-			__thread_fpu_end(tsk);
-		}
-	}
-	preempt_enable();
-}
-EXPORT_SYMBOL(unlazy_fpu);
-
-unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
-unsigned int xstate_size;
-EXPORT_SYMBOL_GPL(xstate_size);
-static struct i387_fxsave_struct fx_scratch;
-
-static void mxcsr_feature_mask_init(void)
-{
-	unsigned long mask = 0;
-
-	if (cpu_has_fxsr) {
-		memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct));
-		asm volatile("fxsave %0" : "+m" (fx_scratch));
-		mask = fx_scratch.mxcsr_mask;
-		if (mask == 0)
-			mask = 0x0000ffbf;
-	}
-	mxcsr_feature_mask &= mask;
-}
-
-static void init_thread_xstate(void)
-{
-	/*
-	 * Note that xstate_size might be overwriten later during
-	 * xsave_init().
-	 */
-
-	if (!cpu_has_fpu) {
-		/*
-		 * Disable xsave as we do not support it if i387
-		 * emulation is enabled.
-		 */
-		setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-		setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-		xstate_size = sizeof(struct i387_soft_struct);
-		return;
-	}
-
-	if (cpu_has_fxsr)
-		xstate_size = sizeof(struct i387_fxsave_struct);
-	else
-		xstate_size = sizeof(struct i387_fsave_struct);
-}
-
-/*
- * Called at bootup to set up the initial FPU state that is later cloned
- * into all processes.
- */
-
-void fpu_init(void)
-{
-	unsigned long cr0;
-	unsigned long cr4_mask = 0;
-
-#ifndef CONFIG_MATH_EMULATION
-	if (!cpu_has_fpu) {
-		pr_emerg("No FPU found and no math emulation present\n");
-		pr_emerg("Giving up\n");
-		for (;;)
-			asm volatile("hlt");
-	}
-#endif
-	if (cpu_has_fxsr)
-		cr4_mask |= X86_CR4_OSFXSR;
-	if (cpu_has_xmm)
-		cr4_mask |= X86_CR4_OSXMMEXCPT;
-	if (cr4_mask)
-		cr4_set_bits(cr4_mask);
-
-	cr0 = read_cr0();
-	cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
-	if (!cpu_has_fpu)
-		cr0 |= X86_CR0_EM;
-	write_cr0(cr0);
-
-	/*
-	 * init_thread_xstate is only called once to avoid overriding
-	 * xstate_size during boot time or during CPU hotplug.
-	 */
-	if (xstate_size == 0)
-		init_thread_xstate();
-
-	mxcsr_feature_mask_init();
-	xsave_init();
-	eager_fpu_init();
-}
-
-void fpu_finit(struct fpu *fpu)
-{
-	if (!cpu_has_fpu) {
-		finit_soft_fpu(&fpu->state->soft);
-		return;
-	}
-
-	memset(fpu->state, 0, xstate_size);
-
-	if (cpu_has_fxsr) {
-		fx_finit(&fpu->state->fxsave);
-	} else {
-		struct i387_fsave_struct *fp = &fpu->state->fsave;
-		fp->cwd = 0xffff037fu;
-		fp->swd = 0xffff0000u;
-		fp->twd = 0xffffffffu;
-		fp->fos = 0xffff0000u;
-	}
-}
-EXPORT_SYMBOL_GPL(fpu_finit);
-
-/*
- * The _current_ task is using the FPU for the first time
- * so initialize it and set the mxcsr to its default
- * value at reset if we support XMM instructions and then
- * remember the current task has used the FPU.
- */
-int init_fpu(struct task_struct *tsk)
-{
-	int ret;
-
-	if (tsk_used_math(tsk)) {
-		if (cpu_has_fpu && tsk == current)
-			unlazy_fpu(tsk);
-		task_disable_lazy_fpu_restore(tsk);
-		return 0;
-	}
-
-	/*
-	 * Memory allocation at the first usage of the FPU and other state.
-	 */
-	ret = fpu_alloc(&tsk->thread.fpu);
-	if (ret)
-		return ret;
-
-	fpu_finit(&tsk->thread.fpu);
-
-	set_stopped_child_used_math(tsk);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(init_fpu);
-
-/*
- * The xstateregs_active() routine is the same as the fpregs_active() routine,
- * as the "regset->n" for the xstate regset will be updated based on the feature
- * capabilites supported by the xsave.
- */
-int fpregs_active(struct task_struct *target, const struct user_regset *regset)
-{
-	return tsk_used_math(target) ? regset->n : 0;
-}
-
-int xfpregs_active(struct task_struct *target, const struct user_regset *regset)
-{
-	return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0;
-}
-
-int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
-		unsigned int pos, unsigned int count,
-		void *kbuf, void __user *ubuf)
-{
-	int ret;
-
-	if (!cpu_has_fxsr)
-		return -ENODEV;
-
-	ret = init_fpu(target);
-	if (ret)
-		return ret;
-
-	sanitize_i387_state(target);
-
-	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-				   &target->thread.fpu.state->fxsave, 0, -1);
-}
-
-int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
-		unsigned int pos, unsigned int count,
-		const void *kbuf, const void __user *ubuf)
-{
-	int ret;
-
-	if (!cpu_has_fxsr)
-		return -ENODEV;
-
-	ret = init_fpu(target);
-	if (ret)
-		return ret;
-
-	sanitize_i387_state(target);
-
-	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-				 &target->thread.fpu.state->fxsave, 0, -1);
-
-	/*
-	 * mxcsr reserved bits must be masked to zero for security reasons.
-	 */
-	target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
-
-	/*
-	 * update the header bits in the xsave header, indicating the
-	 * presence of FP and SSE state.
-	 */
-	if (cpu_has_xsave)
-		target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
-
-	return ret;
-}
-
-int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
-		unsigned int pos, unsigned int count,
-		void *kbuf, void __user *ubuf)
-{
-	struct xsave_struct *xsave;
-	int ret;
-
-	if (!cpu_has_xsave)
-		return -ENODEV;
-
-	ret = init_fpu(target);
-	if (ret)
-		return ret;
-
-	xsave = &target->thread.fpu.state->xsave;
-
-	/*
-	 * Copy the 48bytes defined by the software first into the xstate
-	 * memory layout in the thread struct, so that we can copy the entire
-	 * xstateregs to the user using one user_regset_copyout().
-	 */
-	memcpy(&xsave->i387.sw_reserved,
-		xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
-	/*
-	 * Copy the xstate memory layout.
-	 */
-	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
-	return ret;
-}
-
-int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
-		  unsigned int pos, unsigned int count,
-		  const void *kbuf, const void __user *ubuf)
-{
-	struct xsave_struct *xsave;
-	int ret;
-
-	if (!cpu_has_xsave)
-		return -ENODEV;
-
-	ret = init_fpu(target);
-	if (ret)
-		return ret;
-
-	xsave = &target->thread.fpu.state->xsave;
-
-	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
-	/*
-	 * mxcsr reserved bits must be masked to zero for security reasons.
-	 */
-	xsave->i387.mxcsr &= mxcsr_feature_mask;
-	xsave->xsave_hdr.xstate_bv &= pcntxt_mask;
-	/*
-	 * These bits must be zero.
-	 */
-	memset(&xsave->xsave_hdr.reserved, 0, 48);
-	return ret;
-}
-
-#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
-
-/*
- * FPU tag word conversions.
- */
-
-static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
-{
-	unsigned int tmp; /* to avoid 16 bit prefixes in the code */
-
-	/* Transform each pair of bits into 01 (valid) or 00 (empty) */
-	tmp = ~twd;
-	tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */
-	/* and move the valid bits to the lower byte. */
-	tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
-	tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
-	tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
-
-	return tmp;
-}
-
-#define FPREG_ADDR(f, n)	((void *)&(f)->st_space + (n) * 16)
-#define FP_EXP_TAG_VALID	0
-#define FP_EXP_TAG_ZERO		1
-#define FP_EXP_TAG_SPECIAL	2
-#define FP_EXP_TAG_EMPTY	3
-
-static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
-{
-	struct _fpxreg *st;
-	u32 tos = (fxsave->swd >> 11) & 7;
-	u32 twd = (unsigned long) fxsave->twd;
-	u32 tag;
-	u32 ret = 0xffff0000u;
-	int i;
-
-	for (i = 0; i < 8; i++, twd >>= 1) {
-		if (twd & 0x1) {
-			st = FPREG_ADDR(fxsave, (i - tos) & 7);
-
-			switch (st->exponent & 0x7fff) {
-			case 0x7fff:
-				tag = FP_EXP_TAG_SPECIAL;
-				break;
-			case 0x0000:
-				if (!st->significand[0] &&
-				    !st->significand[1] &&
-				    !st->significand[2] &&
-				    !st->significand[3])
-					tag = FP_EXP_TAG_ZERO;
-				else
-					tag = FP_EXP_TAG_SPECIAL;
-				break;
-			default:
-				if (st->significand[3] & 0x8000)
-					tag = FP_EXP_TAG_VALID;
-				else
-					tag = FP_EXP_TAG_SPECIAL;
-				break;
-			}
-		} else {
-			tag = FP_EXP_TAG_EMPTY;
-		}
-		ret |= tag << (2 * i);
-	}
-	return ret;
-}
-
-/*
- * FXSR floating point environment conversions.
- */
-
-void
-convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
-{
-	struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
-	struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
-	struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
-	int i;
-
-	env->cwd = fxsave->cwd | 0xffff0000u;
-	env->swd = fxsave->swd | 0xffff0000u;
-	env->twd = twd_fxsr_to_i387(fxsave);
-
-#ifdef CONFIG_X86_64
-	env->fip = fxsave->rip;
-	env->foo = fxsave->rdp;
-	/*
-	 * should be actually ds/cs at fpu exception time, but
-	 * that information is not available in 64bit mode.
-	 */
-	env->fcs = task_pt_regs(tsk)->cs;
-	if (tsk == current) {
-		savesegment(ds, env->fos);
-	} else {
-		env->fos = tsk->thread.ds;
-	}
-	env->fos |= 0xffff0000;
-#else
-	env->fip = fxsave->fip;
-	env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
-	env->foo = fxsave->foo;
-	env->fos = fxsave->fos;
-#endif
-
-	for (i = 0; i < 8; ++i)
-		memcpy(&to[i], &from[i], sizeof(to[0]));
-}
-
-void convert_to_fxsr(struct task_struct *tsk,
-		     const struct user_i387_ia32_struct *env)
-
-{
-	struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
-	struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
-	struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
-	int i;
-
-	fxsave->cwd = env->cwd;
-	fxsave->swd = env->swd;
-	fxsave->twd = twd_i387_to_fxsr(env->twd);
-	fxsave->fop = (u16) ((u32) env->fcs >> 16);
-#ifdef CONFIG_X86_64
-	fxsave->rip = env->fip;
-	fxsave->rdp = env->foo;
-	/* cs and ds ignored */
-#else
-	fxsave->fip = env->fip;
-	fxsave->fcs = (env->fcs & 0xffff);
-	fxsave->foo = env->foo;
-	fxsave->fos = env->fos;
-#endif
-
-	for (i = 0; i < 8; ++i)
-		memcpy(&to[i], &from[i], sizeof(from[0]));
-}
-
-int fpregs_get(struct task_struct *target, const struct user_regset *regset,
-	       unsigned int pos, unsigned int count,
-	       void *kbuf, void __user *ubuf)
-{
-	struct user_i387_ia32_struct env;
-	int ret;
-
-	ret = init_fpu(target);
-	if (ret)
-		return ret;
-
-	if (!static_cpu_has(X86_FEATURE_FPU))
-		return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
-
-	if (!cpu_has_fxsr)
-		return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-					   &target->thread.fpu.state->fsave, 0,
-					   -1);
-
-	sanitize_i387_state(target);
-
-	if (kbuf && pos == 0 && count == sizeof(env)) {
-		convert_from_fxsr(kbuf, target);
-		return 0;
-	}
-
-	convert_from_fxsr(&env, target);
-
-	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
-}
-
-int fpregs_set(struct task_struct *target, const struct user_regset *regset,
-	       unsigned int pos, unsigned int count,
-	       const void *kbuf, const void __user *ubuf)
-{
-	struct user_i387_ia32_struct env;
-	int ret;
-
-	ret = init_fpu(target);
-	if (ret)
-		return ret;
-
-	sanitize_i387_state(target);
-
-	if (!static_cpu_has(X86_FEATURE_FPU))
-		return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
-
-	if (!cpu_has_fxsr)
-		return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-					  &target->thread.fpu.state->fsave, 0,
-					  -1);
-
-	if (pos > 0 || count < sizeof(env))
-		convert_from_fxsr(&env, target);
-
-	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
-	if (!ret)
-		convert_to_fxsr(target, &env);
-
-	/*
-	 * update the header bit in the xsave header, indicating the
-	 * presence of FP.
-	 */
-	if (cpu_has_xsave)
-		target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
-	return ret;
-}
-
-/*
- * FPU state for core dumps.
- * This is only used for a.out dumps now.
- * It is declared generically using elf_fpregset_t (which is
- * struct user_i387_struct) but is in fact only used for 32-bit
- * dumps, so on 64-bit it is really struct user_i387_ia32_struct.
- */
-int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu)
-{
-	struct task_struct *tsk = current;
-	int fpvalid;
-
-	fpvalid = !!used_math();
-	if (fpvalid)
-		fpvalid = !fpregs_get(tsk, NULL,
-				      0, sizeof(struct user_i387_ia32_struct),
-				      fpu, NULL);
-
-	return fpvalid;
-}
-EXPORT_SYMBOL(dump_fpu);
-
-#endif	/* CONFIG_X86_32 || CONFIG_IA32_EMULATION */
-
-static int __init no_387(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_FPU);
-	return 1;
-}
-
-__setup("no387", no_387);
-
-void fpu_detect(struct cpuinfo_x86 *c)
-{
-	unsigned long cr0;
-	u16 fsw, fcw;
-
-	fsw = fcw = 0xffff;
-
-	cr0 = read_cr0();
-	cr0 &= ~(X86_CR0_TS | X86_CR0_EM);
-	write_cr0(cr0);
-
-	asm volatile("fninit ; fnstsw %0 ; fnstcw %1"
-		     : "+m" (fsw), "+m" (fcw));
-
-	if (fsw == 0 && (fcw & 0x103f) == 0x003f)
-		set_cpu_cap(c, X86_FEATURE_FPU);
-	else
-		clear_cpu_cap(c, X86_FEATURE_FPU);
-
-	/* The final cr0 value is set in fpu_init() */
-}
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index e7cc5370cd2f..16cb827a5b27 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -329,8 +329,8 @@ static void init_8259A(int auto_eoi)
 	 */
 	outb_pic(0x11, PIC_MASTER_CMD);	/* ICW1: select 8259A-1 init */
 
-	/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
-	outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
+	/* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */
+	outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR);
 
 	/* 8259A-1 (the master) has a slave on IR2 */
 	outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);
@@ -342,8 +342,8 @@ static void init_8259A(int auto_eoi)
 
 	outb_pic(0x11, PIC_SLAVE_CMD);	/* ICW1: select 8259A-2 init */
 
-	/* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */
-	outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
+	/* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */
+	outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR);
 	/* 8259A-2 is a slave on master's IR2 */
 	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
 	/* (slave's support for AEOI in flat mode is to be investigated) */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index e5952c225532..88b366487b0e 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -22,6 +22,12 @@
 #define CREATE_TRACE_POINTS
 #include <asm/trace/irq_vectors.h>
 
+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
+
 atomic_t irq_err_count;
 
 /* Function pointer for generic interrupt vector handling */
@@ -116,6 +122,12 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
 	seq_puts(p, "  Threshold APIC interrupts\n");
 #endif
+#ifdef CONFIG_X86_MCE_AMD
+	seq_printf(p, "%*s: ", prec, "DFR");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count);
+	seq_puts(p, "  Deferred Error APIC interrupts\n");
+#endif
 #ifdef CONFIG_X86_MCE
 	seq_printf(p, "%*s: ", prec, "MCE");
 	for_each_online_cpu(j)
@@ -136,6 +148,18 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 #if defined(CONFIG_X86_IO_APIC)
 	seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
 #endif
+#ifdef CONFIG_HAVE_KVM
+	seq_printf(p, "%*s: ", prec, "PIN");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
+	seq_puts(p, "  Posted-interrupt notification event\n");
+
+	seq_printf(p, "%*s: ", prec, "PIW");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ",
+			   irq_stats(j)->kvm_posted_intr_wakeup_ipis);
+	seq_puts(p, "  Posted-interrupt wakeup event\n");
+#endif
 	return 0;
 }
 
@@ -192,8 +216,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 	unsigned vector = ~regs->orig_ax;
 	unsigned irq;
 
-	irq_enter();
-	exit_idle();
+	entering_irq();
 
 	irq = __this_cpu_read(vector_irq[vector]);
 
@@ -209,7 +232,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 		}
 	}
 
-	irq_exit();
+	exiting_irq();
 
 	set_irq_regs(old_regs);
 	return 1;
@@ -237,6 +260,18 @@ __visible void smp_x86_platform_ipi(struct pt_regs *regs)
 }
 
 #ifdef CONFIG_HAVE_KVM
+static void dummy_handler(void) {}
+static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler;
+
+void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
+{
+	if (handler)
+		kvm_posted_intr_wakeup_handler = handler;
+	else
+		kvm_posted_intr_wakeup_handler = dummy_handler;
+}
+EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
+
 /*
  * Handler for POSTED_INTERRUPT_VECTOR.
  */
@@ -244,16 +279,23 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
-	ack_APIC_irq();
-
-	irq_enter();
-
-	exit_idle();
-
+	entering_ack_irq();
 	inc_irq_stat(kvm_posted_intr_ipis);
+	exiting_irq();
+	set_irq_regs(old_regs);
+}
 
-	irq_exit();
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
 
+	entering_ack_irq();
+	inc_irq_stat(kvm_posted_intr_wakeup_ipis);
+	kvm_posted_intr_wakeup_handler();
+	exiting_irq();
 	set_irq_regs(old_regs);
 }
 #endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index f9fd86a7fcc7..cd74f5978ab9 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -21,12 +21,6 @@
 
 #include <asm/apic.h>
 
-DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-EXPORT_PER_CPU_SYMBOL(irq_stat);
-
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
-
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 
 int sysctl_panic_on_stackoverflow __read_mostly;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 394e643d7830..bc4604e500a3 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -20,12 +20,6 @@
 #include <asm/idle.h>
 #include <asm/apic.h>
 
-DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-EXPORT_PER_CPU_SYMBOL(irq_stat);
-
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
-
 int sysctl_panic_on_stackoverflow;
 
 /*
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
index 15d741ddfeeb..dc5fa6a1e8d6 100644
--- a/arch/x86/kernel/irq_work.c
+++ b/arch/x86/kernel/irq_work.c
@@ -10,12 +10,6 @@
 #include <asm/apic.h>
 #include <asm/trace/irq_vectors.h>
 
-static inline void irq_work_entering_irq(void)
-{
-	irq_enter();
-	ack_APIC_irq();
-}
-
 static inline void __smp_irq_work_interrupt(void)
 {
 	inc_irq_stat(apic_irq_work_irqs);
@@ -24,14 +18,14 @@ static inline void __smp_irq_work_interrupt(void)
 
 __visible void smp_irq_work_interrupt(struct pt_regs *regs)
 {
-	irq_work_entering_irq();
+	ipi_entering_ack_irq();
 	__smp_irq_work_interrupt();
 	exiting_irq();
 }
 
 __visible void smp_trace_irq_work_interrupt(struct pt_regs *regs)
 {
-	irq_work_entering_irq();
+	ipi_entering_ack_irq();
 	trace_irq_work_entry(IRQ_WORK_VECTOR);
 	__smp_irq_work_interrupt();
 	trace_irq_work_exit(IRQ_WORK_VECTOR);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index cd10a6437264..a3a5e158ed69 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -86,7 +86,7 @@ void __init init_IRQ(void)
 	int i;
 
 	/*
-	 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
+	 * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15.
 	 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
 	 * then this configuration will likely be static after the boot. If
 	 * these IRQ's are handled by more mordern controllers like IO-APIC,
@@ -94,7 +94,7 @@ void __init init_IRQ(void)
 	 * irq's migrate etc.
 	 */
 	for (i = 0; i < nr_legacy_irqs(); i++)
-		per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
+		per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i;
 
 	x86_init.irqs.intr_init();
 }
@@ -135,6 +135,10 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
 #endif
 
+#ifdef CONFIG_X86_MCE_AMD
+	alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt);
+#endif
+
 #ifdef CONFIG_X86_LOCAL_APIC
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
@@ -144,6 +148,8 @@ static void __init apic_intr_init(void)
 #ifdef CONFIG_HAVE_KVM
 	/* IPI for KVM to deliver posted interrupt */
 	alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
+	/* IPI for KVM to deliver interrupt to wake up tasks */
+	alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi);
 #endif
 
 	/* IPI vectors for APIC spurious and error interrupts */
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 9435620062df..47190bd399e7 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -331,7 +331,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
 	apic_write(APIC_EOI, APIC_EOI_ACK);
 }
 
-void kvm_guest_cpu_init(void)
+static void kvm_guest_cpu_init(void)
 {
 	if (!kvm_para_available())
 		return;
@@ -584,6 +584,39 @@ static void kvm_kick_cpu(int cpu)
 	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
 }
 
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+#include <asm/qspinlock.h>
+
+static void kvm_wait(u8 *ptr, u8 val)
+{
+	unsigned long flags;
+
+	if (in_nmi())
+		return;
+
+	local_irq_save(flags);
+
+	if (READ_ONCE(*ptr) != val)
+		goto out;
+
+	/*
+	 * halt until it's our turn and kicked. Note that we do safe halt
+	 * for irq enabled case to avoid hang when lock info is overwritten
+	 * in irq spinlock slowpath and no spurious interrupt occur to save us.
+	 */
+	if (arch_irqs_disabled_flags(flags))
+		halt();
+	else
+		safe_halt();
+
+out:
+	local_irq_restore(flags);
+}
+
+#else /* !CONFIG_QUEUED_SPINLOCKS */
+
 enum kvm_contention_stat {
 	TAKEN_SLOW,
 	TAKEN_SLOW_PICKUP,
@@ -655,7 +688,7 @@ static inline void spin_time_accum_blocked(u64 start)
 static struct dentry *d_spin_debug;
 static struct dentry *d_kvm_debug;
 
-struct dentry *kvm_init_debugfs(void)
+static struct dentry *kvm_init_debugfs(void)
 {
 	d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);
 	if (!d_kvm_debug)
@@ -817,6 +850,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
 	}
 }
 
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
+
 /*
  * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
  */
@@ -828,8 +863,16 @@ void __init kvm_spinlock_init(void)
 	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
 		return;
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+	__pv_init_lock_hash();
+	pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+	pv_lock_ops.wait = kvm_wait;
+	pv_lock_ops.kick = kvm_kick_cpu;
+#else /* !CONFIG_QUEUED_SPINLOCKS */
 	pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
 	pv_lock_ops.unlock_kick = kvm_unlock_kick;
+#endif
 }
 
 static __init int kvm_spinlock_init_jump(void)
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 42caaef897c8..49487b488061 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -24,6 +24,7 @@
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
 #include <linux/memblock.h>
+#include <linux/sched.h>
 
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
@@ -217,8 +218,10 @@ static void kvm_shutdown(void)
 
 void __init kvmclock_init(void)
 {
+	struct pvclock_vcpu_time_info *vcpu_time;
 	unsigned long mem;
-	int size;
+	int size, cpu;
+	u8 flags;
 
 	size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
 
@@ -264,7 +267,14 @@ void __init kvmclock_init(void)
 	pv_info.name = "KVM";
 
 	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
-		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+		pvclock_set_flags(~0);
+
+	cpu = get_cpu();
+	vcpu_time = &hv_clock[cpu].pvti;
+	flags = pvclock_read_flags(vcpu_time);
+	if (flags & PVCLOCK_COUNTS_FROM_ZERO)
+		set_sched_clock_stable();
+	put_cpu();
 }
 
 int __init kvm_setup_vsyscall_timeinfo(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 415480d3ea84..819ab3f9c9c7 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -17,6 +17,7 @@
 #include <linux/ftrace.h>
 #include <linux/io.h>
 #include <linux/suspend.h>
+#include <linux/vmalloc.h>
 
 #include <asm/init.h>
 #include <asm/pgtable.h>
@@ -25,6 +26,7 @@
 #include <asm/io_apic.h>
 #include <asm/debugreg.h>
 #include <asm/kexec-bzimage64.h>
+#include <asm/setup.h>
 
 #ifdef CONFIG_KEXEC_FILE
 static struct kexec_file_ops *kexec_file_loaders[] = {
@@ -334,7 +336,7 @@ void arch_crash_save_vmcoreinfo(void)
 	VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
 #endif
 	vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
-			      (unsigned long)&_text - __START_KERNEL);
+			      kaslr_offset());
 }
 
 /* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 2d2a237f2c73..30ca7607cbbb 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -19,8 +19,8 @@
 #include <linux/module.h>
 #include <linux/smp.h>
 #include <linux/pci.h>
-#include <linux/irqdomain.h>
 
+#include <asm/irqdomain.h>
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
 #include <asm/pgalloc.h>
@@ -113,11 +113,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
 		pr_warn("Unknown bustype %s - ignoring\n", str);
 }
 
-static struct irq_domain_ops mp_ioapic_irqdomain_ops = {
-	.map = mp_irqdomain_map,
-	.unmap = mp_irqdomain_unmap,
-};
-
 static void __init MP_ioapic_info(struct mpc_ioapic *m)
 {
 	struct ioapic_domain_cfg cfg = {
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index bbb6c7316341..33ee3e0efd65 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,11 +8,33 @@
 
 #include <asm/paravirt.h>
 
+#ifdef CONFIG_QUEUED_SPINLOCKS
+__visible void __native_queued_spin_unlock(struct qspinlock *lock)
+{
+	native_queued_spin_unlock(lock);
+}
+
+PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
+
+bool pv_is_native_spin_unlock(void)
+{
+	return pv_lock_ops.queued_spin_unlock.func ==
+		__raw_callee_save___native_queued_spin_unlock;
+}
+#endif
+
 struct pv_lock_ops pv_lock_ops = {
 #ifdef CONFIG_SMP
+#ifdef CONFIG_QUEUED_SPINLOCKS
+	.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
+	.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
+	.wait = paravirt_nop,
+	.kick = paravirt_nop,
+#else /* !CONFIG_QUEUED_SPINLOCKS */
 	.lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
 	.unlock_kick = paravirt_nop,
-#endif
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
+#endif /* SMP */
 };
 EXPORT_SYMBOL(pv_lock_ops);
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c614dd492f5f..58bcfb67c01f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -154,7 +154,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
 		ret = paravirt_patch_ident_64(insnbuf, len);
 
 	else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
+#ifdef CONFIG_X86_32
 		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
+#endif
 		 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
 		 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
 		/* If operation requires a jmp, then jmp */
@@ -371,7 +373,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
 
 	.load_sp0 = native_load_sp0,
 
-#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+#if defined(CONFIG_X86_32)
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 #endif
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index d9f32e6d6ab6..e1b013696dde 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,10 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
 
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
+#endif
+
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
 {
 	/* arg in %eax, return in %eax */
@@ -24,6 +28,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
 	return 0;
 }
 
+extern bool pv_is_native_spin_unlock(void);
+
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		      unsigned long addr, unsigned len)
 {
@@ -47,14 +53,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_mmu_ops, write_cr3);
 		PATCH_SITE(pv_cpu_ops, clts);
 		PATCH_SITE(pv_cpu_ops, read_tsc);
-
-	patch_site:
-		ret = paravirt_patch_insns(ibuf, len, start, end);
-		break;
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+		case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+			if (pv_is_native_spin_unlock()) {
+				start = start_pv_lock_ops_queued_spin_unlock;
+				end   = end_pv_lock_ops_queued_spin_unlock;
+				goto patch_site;
+			}
+#endif
 
 	default:
 		ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
 		break;
+
+patch_site:
+		ret = paravirt_patch_insns(ibuf, len, start, end);
+		break;
 	}
 #undef PATCH_SITE
 	return ret;
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index a1da6737ba5b..8aa05583bc42 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -21,6 +21,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
 DEF_NATIVE(, mov32, "mov %edi, %eax");
 DEF_NATIVE(, mov64, "mov %rdi, %rax");
 
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
+#endif
+
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
 {
 	return paravirt_patch_insns(insnbuf, len,
@@ -33,6 +37,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
 				    start__mov64, end__mov64);
 }
 
+extern bool pv_is_native_spin_unlock(void);
+
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		      unsigned long addr, unsigned len)
 {
@@ -49,7 +55,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_irq_ops, save_fl);
 		PATCH_SITE(pv_irq_ops, irq_enable);
 		PATCH_SITE(pv_irq_ops, irq_disable);
-		PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
 		PATCH_SITE(pv_cpu_ops, usergs_sysret32);
 		PATCH_SITE(pv_cpu_ops, usergs_sysret64);
 		PATCH_SITE(pv_cpu_ops, swapgs);
@@ -59,14 +64,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_cpu_ops, clts);
 		PATCH_SITE(pv_mmu_ops, flush_tlb_single);
 		PATCH_SITE(pv_cpu_ops, wbinvd);
-
-	patch_site:
-		ret = paravirt_patch_insns(ibuf, len, start, end);
-		break;
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+		case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+			if (pv_is_native_spin_unlock()) {
+				start = start_pv_lock_ops_queued_spin_unlock;
+				end   = end_pv_lock_ops_queued_spin_unlock;
+				goto patch_site;
+			}
+#endif
 
 	default:
 		ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
 		break;
+
+patch_site:
+		ret = paravirt_patch_insns(ibuf, len, start, end);
+		break;
 	}
 #undef PATCH_SITE
 	return ret;
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index a25e202bb319..353972c1946c 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -140,6 +140,51 @@ void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
 		free_pages((unsigned long)vaddr, get_order(size));
 }
 
+void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		      gfp_t gfp, struct dma_attrs *attrs)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+	void *memory;
+
+	gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
+	if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
+		return memory;
+
+	if (!dev)
+		dev = &x86_dma_fallback_dev;
+
+	if (!is_device_dma_capable(dev))
+		return NULL;
+
+	if (!ops->alloc)
+		return NULL;
+
+	memory = ops->alloc(dev, size, dma_handle,
+			    dma_alloc_coherent_gfp_flags(dev, gfp), attrs);
+	debug_dma_alloc_coherent(dev, size, *dma_handle, memory);
+
+	return memory;
+}
+EXPORT_SYMBOL(dma_alloc_attrs);
+
+void dma_free_attrs(struct device *dev, size_t size,
+		    void *vaddr, dma_addr_t bus,
+		    struct dma_attrs *attrs)
+{
+	struct dma_map_ops *ops = get_dma_ops(dev);
+
+	WARN_ON(irqs_disabled());       /* for portability */
+
+	if (dma_release_from_coherent(dev, get_order(size), vaddr))
+		return;
+
+	debug_dma_free_coherent(dev, size, vaddr, bus);
+	if (ops->free)
+		ops->free(dev, size, vaddr, bus, attrs);
+}
+EXPORT_SYMBOL(dma_free_attrs);
+
 /*
  * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel
  * parameter documentation.
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 77dd0ad58be4..adf0392d549a 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -20,6 +20,13 @@ void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 {
 	void *vaddr;
 
+	/*
+	 * Don't print a warning when the first allocation attempt fails.
+	 * swiotlb_alloc_coherent() will print a warning when the DMA
+	 * memory allocation ultimately failed.
+	 */
+	flags |= __GFP_NOWARN;
+
 	vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags,
 					   attrs);
 	if (vaddr)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6e338e3b1dc0..9cad694ed7c4 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -25,8 +25,7 @@
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/mwait.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/debugreg.h>
 #include <asm/nmi.h>
 #include <asm/tlbflush.h>
@@ -76,9 +75,6 @@ void idle_notifier_unregister(struct notifier_block *n)
 EXPORT_SYMBOL_GPL(idle_notifier_unregister);
 #endif
 
-struct kmem_cache *task_xstate_cachep;
-EXPORT_SYMBOL_GPL(task_xstate_cachep);
-
 /*
  * this gets called so that we can store lazy state into memory and copy the
  * current task into the new thread.
@@ -87,36 +83,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
 	*dst = *src;
 
-	dst->thread.fpu_counter = 0;
-	dst->thread.fpu.has_fpu = 0;
-	dst->thread.fpu.state = NULL;
-	task_disable_lazy_fpu_restore(dst);
-	if (tsk_used_math(src)) {
-		int err = fpu_alloc(&dst->thread.fpu);
-		if (err)
-			return err;
-		fpu_copy(dst, src);
-	}
-	return 0;
-}
-
-void free_thread_xstate(struct task_struct *tsk)
-{
-	fpu_free(&tsk->thread.fpu);
-}
-
-void arch_release_task_struct(struct task_struct *tsk)
-{
-	free_thread_xstate(tsk);
-}
-
-void arch_task_cache_init(void)
-{
-        task_xstate_cachep =
-        	kmem_cache_create("task_xstate", xstate_size,
-				  __alignof__(union thread_xstate),
-				  SLAB_PANIC | SLAB_NOTRACK, NULL);
-	setup_xstate_comp();
+	return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
 }
 
 /*
@@ -127,6 +94,7 @@ void exit_thread(void)
 	struct task_struct *me = current;
 	struct thread_struct *t = &me->thread;
 	unsigned long *bp = t->io_bitmap_ptr;
+	struct fpu *fpu = &t->fpu;
 
 	if (bp) {
 		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
@@ -142,7 +110,7 @@ void exit_thread(void)
 		kfree(bp);
 	}
 
-	drop_fpu(me);
+	fpu__drop(fpu);
 }
 
 void flush_thread(void)
@@ -152,19 +120,7 @@ void flush_thread(void)
 	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 
-	if (!use_eager_fpu()) {
-		/* FPU state will be reallocated lazily at the first use. */
-		drop_fpu(tsk);
-		free_thread_xstate(tsk);
-	} else {
-		if (!tsk_used_math(tsk)) {
-			/* kthread execs. TODO: cleanup this horror. */
-			if (WARN_ON(init_fpu(tsk)))
-				force_sig(SIGKILL, tsk);
-			user_fpu_begin();
-		}
-		restore_init_xstate();
-	}
+	fpu__clear(&tsk->thread.fpu);
 }
 
 static void hard_disable_TSC(void)
@@ -445,11 +401,10 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
 }
 
 /*
- * MONITOR/MWAIT with no hints, used for default default C1 state.
- * This invokes MWAIT with interrutps enabled and no flags,
- * which is backwards compatible with the original MWAIT implementation.
+ * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
+ * with interrupts enabled and no flags, which is backwards compatible with the
+ * original MWAIT implementation.
  */
-
 static void mwait_idle(void)
 {
 	if (!current_set_polling_and_test()) {
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8ed2106b06da..c09c99ccf3e3 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -39,8 +39,7 @@
 #include <asm/pgtable.h>
 #include <asm/ldt.h>
 #include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/desc.h>
 #ifdef CONFIG_MATH_EMULATION
 #include <asm/math_emu.h>
@@ -242,14 +241,16 @@ __visible __notrace_funcgraph struct task_struct *
 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
 	struct thread_struct *prev = &prev_p->thread,
-				 *next = &next_p->thread;
+			     *next = &next_p->thread;
+	struct fpu *prev_fpu = &prev->fpu;
+	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
-	fpu_switch_t fpu;
+	fpu_switch_t fpu_switch;
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
-	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
+	fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
 
 	/*
 	 * Save away %gs. No need to save %fs, as it was saved on the
@@ -296,19 +297,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * Leave lazy mode, flushing any hypercalls made here.
 	 * This must be done before restoring TLS segments so
 	 * the GDT and LDT are properly updated, and must be
-	 * done before math_state_restore, so the TS bit is up
+	 * done before fpu__restore(), so the TS bit is up
 	 * to date.
 	 */
 	arch_end_context_switch(next_p);
 
 	/*
-	 * Reload esp0, kernel_stack, and current_top_of_stack.  This changes
+	 * Reload esp0 and cpu_current_top_of_stack.  This changes
 	 * current_thread_info().
 	 */
 	load_sp0(tss, next);
-	this_cpu_write(kernel_stack,
-		       (unsigned long)task_stack_page(next_p) +
-		       THREAD_SIZE);
 	this_cpu_write(cpu_current_top_of_stack,
 		       (unsigned long)task_stack_page(next_p) +
 		       THREAD_SIZE);
@@ -319,7 +317,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (prev->gs | next->gs)
 		lazy_load_gs(next->gs);
 
-	switch_fpu_finish(next_p, fpu);
+	switch_fpu_finish(next_fpu, fpu_switch);
 
 	this_cpu_write(current_task, next_p);
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ddfdbf74f174..843f92e4c711 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -38,8 +38,7 @@
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/mmu_context.h>
 #include <asm/prctl.h>
 #include <asm/desc.h>
@@ -274,12 +273,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
 	struct thread_struct *prev = &prev_p->thread;
 	struct thread_struct *next = &next_p->thread;
+	struct fpu *prev_fpu = &prev->fpu;
+	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
 	unsigned fsindex, gsindex;
-	fpu_switch_t fpu;
+	fpu_switch_t fpu_switch;
 
-	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
+	fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
 
 	/* We must save %fs and %gs before load_TLS() because
 	 * %fs and %gs may be cleared by load_TLS().
@@ -299,7 +300,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * Leave lazy mode, flushing any hypercalls made here.  This
 	 * must be done after loading TLS entries in the GDT but before
 	 * loading segments that might reference them, and and it must
-	 * be done before math_state_restore, so the TS bit is up to
+	 * be done before fpu__restore(), so the TS bit is up to
 	 * date.
 	 */
 	arch_end_context_switch(next_p);
@@ -391,7 +392,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
 	prev->gsindex = gsindex;
 
-	switch_fpu_finish(next_p, fpu);
+	switch_fpu_finish(next_fpu, fpu_switch);
 
 	/*
 	 * Switch the PDA and FPU contexts.
@@ -409,9 +410,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	/* Reload esp0 and ss1.  This changes current_thread_info(). */
 	load_sp0(tss, next);
 
-	this_cpu_write(kernel_stack,
-		(unsigned long)task_stack_page(next_p) + THREAD_SIZE);
-
 	/*
 	 * Now maybe reload the debug registers and handle I/O bitmaps
 	 */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index a7bc79480719..9be72bc3613f 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -11,7 +11,6 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/ptrace.h>
-#include <linux/regset.h>
 #include <linux/tracehook.h>
 #include <linux/user.h>
 #include <linux/elf.h>
@@ -28,8 +27,9 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
+#include <asm/fpu/regset.h>
 #include <asm/debugreg.h>
 #include <asm/ldt.h>
 #include <asm/desc.h>
@@ -1297,7 +1297,7 @@ static struct user_regset x86_64_regsets[] __read_mostly = {
 		.core_note_type = NT_PRFPREG,
 		.n = sizeof(struct user_i387_struct) / sizeof(long),
 		.size = sizeof(long), .align = sizeof(long),
-		.active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
+		.active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set
 	},
 	[REGSET_XSTATE] = {
 		.core_note_type = NT_X86_XSTATE,
@@ -1338,13 +1338,13 @@ static struct user_regset x86_32_regsets[] __read_mostly = {
 		.core_note_type = NT_PRFPREG,
 		.n = sizeof(struct user_i387_ia32_struct) / sizeof(u32),
 		.size = sizeof(u32), .align = sizeof(u32),
-		.active = fpregs_active, .get = fpregs_get, .set = fpregs_set
+		.active = regset_fpregs_active, .get = fpregs_get, .set = fpregs_set
 	},
 	[REGSET_XFP] = {
 		.core_note_type = NT_PRXFPREG,
 		.n = sizeof(struct user32_fxsr_struct) / sizeof(u32),
 		.size = sizeof(u32), .align = sizeof(u32),
-		.active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
+		.active = regset_xregset_fpregs_active, .get = xfpregs_get, .set = xfpregs_set
 	},
 	[REGSET_XSTATE] = {
 		.core_note_type = NT_X86_XSTATE,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d74ac33290ae..39ca113676fe 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -531,12 +531,14 @@ static void __init reserve_crashkernel_low(void)
 	if (ret != 0) {
 		/*
 		 * two parts from lib/swiotlb.c:
-		 *	swiotlb size: user specified with swiotlb= or default.
-		 *	swiotlb overflow buffer: now is hardcoded to 32k.
-		 *		We round it to 8M for other buffers that
-		 *		may need to stay low too.
+		 * -swiotlb size: user-specified with swiotlb= or default.
+		 *
+		 * -swiotlb overflow buffer: now hardcoded to 32k. We round it
+		 * to 8M for other buffers that may need to stay low too. Also
+		 * make sure we allocate enough extra low memory so that we
+		 * don't run out of DMA buffers for 32-bit devices.
 		 */
-		low_size = swiotlb_size_or_default() + (8UL<<20);
+		low_size = max(swiotlb_size_or_default() + (8UL<<20), 256UL<<20);
 		auto_set = true;
 	} else {
 		/* passed with crashkernel=0,low ? */
@@ -834,7 +836,7 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
 {
 	if (kaslr_enabled()) {
 		pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
-			 (unsigned long)&_text - __START_KERNEL,
+			 kaslr_offset(),
 			 __START_KERNEL,
 			 __START_KERNEL_map,
 			 MODULES_VADDR-1);
@@ -1222,8 +1224,7 @@ void __init setup_arch(char **cmdline_p)
 	init_cpu_to_node();
 
 	init_apic_mappings();
-	if (x86_io_apic_ops.init)
-		x86_io_apic_ops.init();
+	io_apic_init_mappings();
 
 	kvm_guest_init();
 
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 1ea14fd53933..206996c1669d 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -26,8 +26,8 @@
 
 #include <asm/processor.h>
 #include <asm/ucontext.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
+#include <asm/fpu/signal.h>
 #include <asm/vdso.h>
 #include <asm/mce.h>
 #include <asm/sighandling.h>
@@ -103,7 +103,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
 		get_user_ex(buf, &sc->fpstate);
 	} get_user_catch(err);
 
-	err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
+	err |= fpu__restore_sig(buf, config_enabled(CONFIG_X86_32));
 
 	force_iret();
 
@@ -199,6 +199,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 	unsigned long sp = regs->sp;
 	unsigned long buf_fx = 0;
 	int onsigstack = on_sig_stack(sp);
+	struct fpu *fpu = &current->thread.fpu;
 
 	/* redzone */
 	if (config_enabled(CONFIG_X86_64))
@@ -218,9 +219,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 		}
 	}
 
-	if (used_math()) {
-		sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32),
-				     &buf_fx, &math_size);
+	if (fpu->fpstate_active) {
+		sp = fpu__alloc_mathframe(sp, config_enabled(CONFIG_X86_32),
+					  &buf_fx, &math_size);
 		*fpstate = (void __user *)sp;
 	}
 
@@ -234,8 +235,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
 		return (void __user *)-1L;
 
 	/* save i387 and extended state */
-	if (used_math() &&
-	    save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0)
+	if (fpu->fpstate_active &&
+	    copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0)
 		return (void __user *)-1L;
 
 	return (void __user *)sp;
@@ -593,6 +594,22 @@ badframe:
 	return 0;
 }
 
+static inline int is_ia32_compat_frame(void)
+{
+	return config_enabled(CONFIG_IA32_EMULATION) &&
+	       test_thread_flag(TIF_IA32);
+}
+
+static inline int is_ia32_frame(void)
+{
+	return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame();
+}
+
+static inline int is_x32_frame(void)
+{
+	return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32);
+}
+
 static int
 setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
 {
@@ -617,6 +634,7 @@ static void
 handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 {
 	bool stepping, failed;
+	struct fpu *fpu = &current->thread.fpu;
 
 	/* Are we from a system call? */
 	if (syscall_get_nr(current, regs) >= 0) {
@@ -665,8 +683,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 		/*
 		 * Ensure the signal handler starts with the new fpu state.
 		 */
-		if (used_math())
-			fpu_reset_state(current);
+		if (fpu->fpstate_active)
+			fpu__clear(fpu);
 	}
 	signal_setup_done(failed, ksig, stepping);
 }
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index be8e1bde07aa..15aaa69bbb5e 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -170,8 +170,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
 
 asmlinkage __visible void smp_reboot_interrupt(void)
 {
-	ack_APIC_irq();
-	irq_enter();
+	ipi_entering_ack_irq();
 	stop_this_cpu(NULL);
 	irq_exit();
 }
@@ -265,12 +264,6 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs)
 	 */
 }
 
-static inline void smp_entering_irq(void)
-{
-	ack_APIC_irq();
-	irq_enter();
-}
-
 __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
 {
 	/*
@@ -279,7 +272,7 @@ __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
 	 * scheduler_ipi(). This is OK, since those functions are allowed
 	 * to nest.
 	 */
-	smp_entering_irq();
+	ipi_entering_ack_irq();
 	trace_reschedule_entry(RESCHEDULE_VECTOR);
 	__smp_reschedule_interrupt();
 	trace_reschedule_exit(RESCHEDULE_VECTOR);
@@ -297,14 +290,14 @@ static inline void __smp_call_function_interrupt(void)
 
 __visible void smp_call_function_interrupt(struct pt_regs *regs)
 {
-	smp_entering_irq();
+	ipi_entering_ack_irq();
 	__smp_call_function_interrupt();
 	exiting_irq();
 }
 
 __visible void smp_trace_call_function_interrupt(struct pt_regs *regs)
 {
-	smp_entering_irq();
+	ipi_entering_ack_irq();
 	trace_call_function_entry(CALL_FUNCTION_VECTOR);
 	__smp_call_function_interrupt();
 	trace_call_function_exit(CALL_FUNCTION_VECTOR);
@@ -319,14 +312,14 @@ static inline void __smp_call_function_single_interrupt(void)
 
 __visible void smp_call_function_single_interrupt(struct pt_regs *regs)
 {
-	smp_entering_irq();
+	ipi_entering_ack_irq();
 	__smp_call_function_single_interrupt();
 	exiting_irq();
 }
 
 __visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
 {
-	smp_entering_irq();
+	ipi_entering_ack_irq();
 	trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
 	__smp_call_function_single_interrupt();
 	trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 50e547eac8cd..8add66b22f33 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -68,8 +68,7 @@
 #include <asm/mwait.h>
 #include <asm/apic.h>
 #include <asm/io_apic.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/setup.h>
 #include <asm/uv/uv.h>
 #include <linux/mc146818rtc.h>
@@ -314,10 +313,10 @@ topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
 		cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2));
 }
 
-#define link_mask(_m, c1, c2)						\
+#define link_mask(mfunc, c1, c2)					\
 do {									\
-	cpumask_set_cpu((c1), cpu_##_m##_mask(c2));			\
-	cpumask_set_cpu((c2), cpu_##_m##_mask(c1));			\
+	cpumask_set_cpu((c1), mfunc(c2));				\
+	cpumask_set_cpu((c2), mfunc(c1));				\
 } while (0)
 
 static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
@@ -398,9 +397,9 @@ void set_cpu_sibling_map(int cpu)
 	cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
 
 	if (!has_mp) {
-		cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+		cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
 		cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
-		cpumask_set_cpu(cpu, cpu_core_mask(cpu));
+		cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
 		c->booted_cores = 1;
 		return;
 	}
@@ -409,32 +408,34 @@ void set_cpu_sibling_map(int cpu)
 		o = &cpu_data(i);
 
 		if ((i == cpu) || (has_smt && match_smt(c, o)))
-			link_mask(sibling, cpu, i);
+			link_mask(topology_sibling_cpumask, cpu, i);
 
 		if ((i == cpu) || (has_mp && match_llc(c, o)))
-			link_mask(llc_shared, cpu, i);
+			link_mask(cpu_llc_shared_mask, cpu, i);
 
 	}
 
 	/*
 	 * This needs a separate iteration over the cpus because we rely on all
-	 * cpu_sibling_mask links to be set-up.
+	 * topology_sibling_cpumask links to be set-up.
 	 */
 	for_each_cpu(i, cpu_sibling_setup_mask) {
 		o = &cpu_data(i);
 
 		if ((i == cpu) || (has_mp && match_die(c, o))) {
-			link_mask(core, cpu, i);
+			link_mask(topology_core_cpumask, cpu, i);
 
 			/*
 			 *  Does this new cpu bringup a new core?
 			 */
-			if (cpumask_weight(cpu_sibling_mask(cpu)) == 1) {
+			if (cpumask_weight(
+			    topology_sibling_cpumask(cpu)) == 1) {
 				/*
 				 * for each core in package, increment
 				 * the booted_cores for this new cpu
 				 */
-				if (cpumask_first(cpu_sibling_mask(i)) == i)
+				if (cpumask_first(
+				    topology_sibling_cpumask(i)) == i)
 					c->booted_cores++;
 				/*
 				 * increment the core count for all
@@ -514,6 +515,40 @@ void __inquire_remote_apic(int apicid)
 }
 
 /*
+ * The Multiprocessor Specification 1.4 (1997) example code suggests
+ * that there should be a 10ms delay between the BSP asserting INIT
+ * and de-asserting INIT, when starting a remote processor.
+ * But that slows boot and resume on modern processors, which include
+ * many cores and don't require that delay.
+ *
+ * Cmdline "init_cpu_udelay=" is available to over-ride this delay.
+ * Modern processor families are quirked to remove the delay entirely.
+ */
+#define UDELAY_10MS_DEFAULT 10000
+
+static unsigned int init_udelay = UDELAY_10MS_DEFAULT;
+
+static int __init cpu_init_udelay(char *str)
+{
+	get_option(&str, &init_udelay);
+
+	return 0;
+}
+early_param("cpu_init_udelay", cpu_init_udelay);
+
+static void __init smp_quirk_init_udelay(void)
+{
+	/* if cmdline changed it from default, leave it alone */
+	if (init_udelay != UDELAY_10MS_DEFAULT)
+		return;
+
+	/* if modern processor, use no delay */
+	if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
+	    ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF)))
+		init_udelay = 0;
+}
+
+/*
  * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
  * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
  * won't ... remember to clear down the APIC, etc later.
@@ -555,7 +590,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
 static int
 wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 {
-	unsigned long send_status, accept_status = 0;
+	unsigned long send_status = 0, accept_status = 0;
 	int maxlvt, num_starts, j;
 
 	maxlvt = lapic_get_maxlvt();
@@ -583,7 +618,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 	pr_debug("Waiting for send to finish...\n");
 	send_status = safe_apic_wait_icr_idle();
 
-	mdelay(10);
+	udelay(init_udelay);
 
 	pr_debug("Deasserting INIT\n");
 
@@ -651,6 +686,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 		 * Give the other CPU some time to accept the IPI.
 		 */
 		udelay(200);
+
 		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
 			apic_write(APIC_ESR, 0);
 		accept_status = (apic_read(APIC_ESR) & 0xEF);
@@ -792,8 +828,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
 	clear_tsk_thread_flag(idle, TIF_FORK);
 	initial_gs = per_cpu_offset(cpu);
 #endif
-	per_cpu(kernel_stack, cpu) =
-		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
 }
 
 /*
@@ -1009,8 +1043,8 @@ static __init void disable_smp(void)
 		physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
 	else
 		physid_set_mask_of_physid(0, &phys_cpu_present_map);
-	cpumask_set_cpu(0, cpu_sibling_mask(0));
-	cpumask_set_cpu(0, cpu_core_mask(0));
+	cpumask_set_cpu(0, topology_sibling_cpumask(0));
+	cpumask_set_cpu(0, topology_core_cpumask(0));
 }
 
 enum {
@@ -1176,6 +1210,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 		uv_system_init();
 
 	set_mtrr_aps_delayed_init();
+
+	smp_quirk_init_udelay();
 }
 
 void arch_enable_nonboot_cpus_begin(void)
@@ -1293,22 +1329,22 @@ static void remove_siblinginfo(int cpu)
 	int sibling;
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
-	for_each_cpu(sibling, cpu_core_mask(cpu)) {
-		cpumask_clear_cpu(cpu, cpu_core_mask(sibling));
+	for_each_cpu(sibling, topology_core_cpumask(cpu)) {
+		cpumask_clear_cpu(cpu, topology_core_cpumask(sibling));
 		/*/
 		 * last thread sibling in this cpu core going down
 		 */
-		if (cpumask_weight(cpu_sibling_mask(cpu)) == 1)
+		if (cpumask_weight(topology_sibling_cpumask(cpu)) == 1)
 			cpu_data(sibling).booted_cores--;
 	}
 
-	for_each_cpu(sibling, cpu_sibling_mask(cpu))
-		cpumask_clear_cpu(cpu, cpu_sibling_mask(sibling));
+	for_each_cpu(sibling, topology_sibling_cpumask(cpu))
+		cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
 	for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
 		cpumask_clear_cpu(cpu, cpu_llc_shared_mask(sibling));
 	cpumask_clear(cpu_llc_shared_mask(cpu));
-	cpumask_clear(cpu_sibling_mask(cpu));
-	cpumask_clear(cpu_core_mask(cpu));
+	cpumask_clear(topology_sibling_cpumask(cpu));
+	cpumask_clear(topology_core_cpumask(cpu));
 	c->phys_proc_id = 0;
 	c->cpu_core_id = 0;
 	cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 324ab5247687..f5791927aa64 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -54,12 +54,13 @@
 #include <asm/ftrace.h>
 #include <asm/traps.h>
 #include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
 #include <asm/mce.h>
 #include <asm/fixmap.h>
 #include <asm/mach_traps.h>
 #include <asm/alternative.h>
+#include <asm/fpu/xstate.h>
+#include <asm/trace/mpx.h>
 #include <asm/mpx.h>
 
 #ifdef CONFIG_X86_64
@@ -72,8 +73,7 @@ gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss;
 #else
 #include <asm/processor-flags.h>
 #include <asm/setup.h>
-
-asmlinkage int system_call(void);
+#include <asm/proto.h>
 #endif
 
 /* Must be page-aligned because the real IDT is used in a fixmap. */
@@ -371,10 +371,8 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 
 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 {
-	struct task_struct *tsk = current;
-	struct xsave_struct *xsave_buf;
 	enum ctx_state prev_state;
-	struct bndcsr *bndcsr;
+	const struct bndcsr *bndcsr;
 	siginfo_t *info;
 
 	prev_state = exception_enter();
@@ -393,15 +391,15 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 
 	/*
 	 * We need to look at BNDSTATUS to resolve this exception.
-	 * It is not directly accessible, though, so we need to
-	 * do an xsave and then pull it out of the xsave buffer.
+	 * A NULL here might mean that it is in its 'init state',
+	 * which is all zeros which indicates MPX was not
+	 * responsible for the exception.
 	 */
-	fpu_save_init(&tsk->thread.fpu);
-	xsave_buf = &(tsk->thread.fpu.state->xsave);
-	bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR);
+	bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
 	if (!bndcsr)
 		goto exit_trap;
 
+	trace_bounds_exception_mpx(bndcsr);
 	/*
 	 * The error code field of the BNDSTATUS register communicates status
 	 * information of a bound range exception #BR or operation involving
@@ -409,11 +407,11 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 	 */
 	switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) {
 	case 2:	/* Bound directory has invalid entry. */
-		if (mpx_handle_bd_fault(xsave_buf))
+		if (mpx_handle_bd_fault())
 			goto exit_trap;
 		break; /* Success, it was handled */
 	case 1: /* Bound violation. */
-		info = mpx_generate_siginfo(regs, xsave_buf);
+		info = mpx_generate_siginfo(regs);
 		if (IS_ERR(info)) {
 			/*
 			 * We failed to decode the MPX instruction.  Act as if
@@ -709,8 +707,8 @@ NOKPROBE_SYMBOL(do_debug);
 static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 {
 	struct task_struct *task = current;
+	struct fpu *fpu = &task->thread.fpu;
 	siginfo_t info;
-	unsigned short err;
 	char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" :
 						"simd exception";
 
@@ -718,8 +716,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 		return;
 	conditional_sti(regs);
 
-	if (!user_mode(regs))
-	{
+	if (!user_mode(regs)) {
 		if (!fixup_exception(regs)) {
 			task->thread.error_code = error_code;
 			task->thread.trap_nr = trapnr;
@@ -731,62 +728,20 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 	/*
 	 * Save the info for the exception handler and clear the error.
 	 */
-	unlazy_fpu(task);
-	task->thread.trap_nr = trapnr;
+	fpu__save(fpu);
+
+	task->thread.trap_nr	= trapnr;
 	task->thread.error_code = error_code;
-	info.si_signo = SIGFPE;
-	info.si_errno = 0;
-	info.si_addr = (void __user *)uprobe_get_trap_addr(regs);
-	if (trapnr == X86_TRAP_MF) {
-		unsigned short cwd, swd;
-		/*
-		 * (~cwd & swd) will mask out exceptions that are not set to unmasked
-		 * status.  0x3f is the exception bits in these regs, 0x200 is the
-		 * C1 reg you need in case of a stack fault, 0x040 is the stack
-		 * fault bit.  We should only be taking one exception at a time,
-		 * so if this combination doesn't produce any single exception,
-		 * then we have a bad program that isn't synchronizing its FPU usage
-		 * and it will suffer the consequences since we won't be able to
-		 * fully reproduce the context of the exception
-		 */
-		cwd = get_fpu_cwd(task);
-		swd = get_fpu_swd(task);
+	info.si_signo		= SIGFPE;
+	info.si_errno		= 0;
+	info.si_addr		= (void __user *)uprobe_get_trap_addr(regs);
 
-		err = swd & ~cwd;
-	} else {
-		/*
-		 * The SIMD FPU exceptions are handled a little differently, as there
-		 * is only a single status/control register.  Thus, to determine which
-		 * unmasked exception was caught we must mask the exception mask bits
-		 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
-		 */
-		unsigned short mxcsr = get_fpu_mxcsr(task);
-		err = ~(mxcsr >> 7) & mxcsr;
-	}
+	info.si_code = fpu__exception_code(fpu, trapnr);
 
-	if (err & 0x001) {	/* Invalid op */
-		/*
-		 * swd & 0x240 == 0x040: Stack Underflow
-		 * swd & 0x240 == 0x240: Stack Overflow
-		 * User must clear the SF bit (0x40) if set
-		 */
-		info.si_code = FPE_FLTINV;
-	} else if (err & 0x004) { /* Divide by Zero */
-		info.si_code = FPE_FLTDIV;
-	} else if (err & 0x008) { /* Overflow */
-		info.si_code = FPE_FLTOVF;
-	} else if (err & 0x012) { /* Denormal, Underflow */
-		info.si_code = FPE_FLTUND;
-	} else if (err & 0x020) { /* Precision */
-		info.si_code = FPE_FLTRES;
-	} else {
-		/*
-		 * If we're using IRQ 13, or supposedly even some trap
-		 * X86_TRAP_MF implementations, it's possible
-		 * we get a spurious trap, which is not an error.
-		 */
+	/* Retry when we get spurious exceptions: */
+	if (!info.si_code)
 		return;
-	}
+
 	force_sig_info(SIGFPE, &info, task);
 }
 
@@ -813,62 +768,8 @@ dotraplinkage void
 do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 {
 	conditional_sti(regs);
-#if 0
-	/* No need to warn about this any longer. */
-	pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
-#endif
-}
-
-asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void)
-{
 }
 
-asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void)
-{
-}
-
-/*
- * 'math_state_restore()' saves the current math information in the
- * old math state array, and gets the new ones from the current task
- *
- * Careful.. There are problems with IBM-designed IRQ13 behaviour.
- * Don't touch unless you *really* know how it works.
- *
- * Must be called with kernel preemption disabled (eg with local
- * local interrupts as in the case of do_device_not_available).
- */
-void math_state_restore(void)
-{
-	struct task_struct *tsk = current;
-
-	if (!tsk_used_math(tsk)) {
-		local_irq_enable();
-		/*
-		 * does a slab alloc which can sleep
-		 */
-		if (init_fpu(tsk)) {
-			/*
-			 * ran out of memory!
-			 */
-			do_group_exit(SIGKILL);
-			return;
-		}
-		local_irq_disable();
-	}
-
-	/* Avoid __kernel_fpu_begin() right after __thread_fpu_begin() */
-	kernel_fpu_disable();
-	__thread_fpu_begin(tsk);
-	if (unlikely(restore_fpu_checking(tsk))) {
-		fpu_reset_state(tsk);
-		force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
-	} else {
-		tsk->thread.fpu_counter++;
-	}
-	kernel_fpu_enable();
-}
-EXPORT_SYMBOL_GPL(math_state_restore);
-
 dotraplinkage void
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
@@ -889,7 +790,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 		return;
 	}
 #endif
-	math_state_restore(); /* interrupts still off */
+	fpu__restore(&current->thread.fpu); /* interrupts still off */
 #ifdef CONFIG_X86_32
 	conditional_sti(regs);
 #endif
@@ -992,13 +893,13 @@ void __init trap_init(void)
 		set_bit(i, used_vectors);
 
 #ifdef CONFIG_IA32_EMULATION
-	set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+	set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat);
 	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
 #endif
 
 #ifdef CONFIG_X86_32
-	set_system_trap_gate(SYSCALL_VECTOR, &system_call);
-	set_bit(SYSCALL_VECTOR, used_vectors);
+	set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
+	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
 #endif
 
 	/*
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 26488487bc61..dd8d0791dfb5 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -113,7 +113,7 @@ static void check_tsc_warp(unsigned int timeout)
  */
 static inline unsigned int loop_timeout(int cpu)
 {
-	return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20;
+	return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
 }
 
 /*
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 0b81ad67da07..66476244731e 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -29,6 +29,7 @@
 #include <linux/kdebug.h>
 #include <asm/processor.h>
 #include <asm/insn.h>
+#include <asm/mmu_context.h>
 
 /* Post-execution fixups. */
 
@@ -312,11 +313,6 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
 }
 
 #ifdef CONFIG_X86_64
-static inline bool is_64bit_mm(struct mm_struct *mm)
-{
-	return	!config_enabled(CONFIG_IA32_EMULATION) ||
-		!(mm->context.ia32_compat == TIF_IA32);
-}
 /*
  * If arch_uprobe->insn doesn't use rip-relative addressing, return
  * immediately.  Otherwise, rewrite the instruction so that it accesses
@@ -497,10 +493,6 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 	}
 }
 #else /* 32-bit: */
-static inline bool is_64bit_mm(struct mm_struct *mm)
-{
-	return false;
-}
 /*
  * No RIP-relative addressing on 32-bit
  */
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 37d8fa4438f0..a0695be19864 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -75,7 +75,5 @@ EXPORT_SYMBOL(native_load_gs_index);
 
 #ifdef CONFIG_PREEMPT
 EXPORT_SYMBOL(___preempt_schedule);
-#ifdef CONFIG_CONTEXT_TRACKING
-EXPORT_SYMBOL(___preempt_schedule_context);
-#endif
+EXPORT_SYMBOL(___preempt_schedule_notrace);
 #endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 234b0722de53..3839628d962e 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -11,7 +11,6 @@
 #include <asm/bios_ebda.h>
 #include <asm/paravirt.h>
 #include <asm/pci_x86.h>
-#include <asm/pci.h>
 #include <asm/mpspec.h>
 #include <asm/setup.h>
 #include <asm/apic.h>
@@ -111,11 +110,9 @@ EXPORT_SYMBOL_GPL(x86_platform);
 #if defined(CONFIG_PCI_MSI)
 struct x86_msi_ops x86_msi = {
 	.setup_msi_irqs		= native_setup_msi_irqs,
-	.compose_msi_msg	= native_compose_msi_msg,
 	.teardown_msi_irq	= native_teardown_msi_irq,
 	.teardown_msi_irqs	= default_teardown_msi_irqs,
 	.restore_msi_irqs	= default_restore_msi_irqs,
-	.setup_hpet_msi		= default_setup_hpet_msi,
 };
 
 /* MSI arch specific hooks */
@@ -141,13 +138,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev)
 #endif
 
 struct x86_io_apic_ops x86_io_apic_ops = {
-	.init			= native_io_apic_init_mappings,
 	.read			= native_io_apic_read,
-	.write			= native_io_apic_write,
-	.modify			= native_io_apic_modify,
 	.disable		= native_disable_io_apic,
-	.print_entries		= native_io_apic_print_entries,
-	.set_affinity		= native_ioapic_set_affinity,
-	.setup_entry		= native_setup_ioapic_entry,
-	.eoi_ioapic_pin		= native_eoi_ioapic_pin,
 };
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
deleted file mode 100644
index 87a815b85f3e..000000000000
--- a/arch/x86/kernel/xsave.c
+++ /dev/null
@@ -1,724 +0,0 @@
-/*
- * xsave/xrstor support.
- *
- * Author: Suresh Siddha <suresh.b.siddha@intel.com>
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/bootmem.h>
-#include <linux/compat.h>
-#include <linux/cpu.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h>
-#include <asm/sigframe.h>
-#include <asm/tlbflush.h>
-#include <asm/xcr.h>
-
-/*
- * Supported feature mask by the CPU and the kernel.
- */
-u64 pcntxt_mask;
-
-/*
- * Represents init state for the supported extended state.
- */
-struct xsave_struct *init_xstate_buf;
-
-static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
-static unsigned int *xstate_offsets, *xstate_sizes;
-static unsigned int xstate_comp_offsets[sizeof(pcntxt_mask)*8];
-static unsigned int xstate_features;
-
-/*
- * If a processor implementation discern that a processor state component is
- * in its initialized state it may modify the corresponding bit in the
- * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory
- * layout in the case of xsaveopt. While presenting the xstate information to
- * the user, we always ensure that the memory layout of a feature will be in
- * the init state if the corresponding header bit is zero. This is to ensure
- * that the user doesn't see some stale state in the memory layout during
- * signal handling, debugging etc.
- */
-void __sanitize_i387_state(struct task_struct *tsk)
-{
-	struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
-	int feature_bit = 0x2;
-	u64 xstate_bv;
-
-	if (!fx)
-		return;
-
-	xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
-
-	/*
-	 * None of the feature bits are in init state. So nothing else
-	 * to do for us, as the memory layout is up to date.
-	 */
-	if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
-		return;
-
-	/*
-	 * FP is in init state
-	 */
-	if (!(xstate_bv & XSTATE_FP)) {
-		fx->cwd = 0x37f;
-		fx->swd = 0;
-		fx->twd = 0;
-		fx->fop = 0;
-		fx->rip = 0;
-		fx->rdp = 0;
-		memset(&fx->st_space[0], 0, 128);
-	}
-
-	/*
-	 * SSE is in init state
-	 */
-	if (!(xstate_bv & XSTATE_SSE))
-		memset(&fx->xmm_space[0], 0, 256);
-
-	xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2;
-
-	/*
-	 * Update all the other memory layouts for which the corresponding
-	 * header bit is in the init state.
-	 */
-	while (xstate_bv) {
-		if (xstate_bv & 0x1) {
-			int offset = xstate_offsets[feature_bit];
-			int size = xstate_sizes[feature_bit];
-
-			memcpy(((void *) fx) + offset,
-			       ((void *) init_xstate_buf) + offset,
-			       size);
-		}
-
-		xstate_bv >>= 1;
-		feature_bit++;
-	}
-}
-
-/*
- * Check for the presence of extended state information in the
- * user fpstate pointer in the sigcontext.
- */
-static inline int check_for_xstate(struct i387_fxsave_struct __user *buf,
-				   void __user *fpstate,
-				   struct _fpx_sw_bytes *fx_sw)
-{
-	int min_xstate_size = sizeof(struct i387_fxsave_struct) +
-			      sizeof(struct xsave_hdr_struct);
-	unsigned int magic2;
-
-	if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw)))
-		return -1;
-
-	/* Check for the first magic field and other error scenarios. */
-	if (fx_sw->magic1 != FP_XSTATE_MAGIC1 ||
-	    fx_sw->xstate_size < min_xstate_size ||
-	    fx_sw->xstate_size > xstate_size ||
-	    fx_sw->xstate_size > fx_sw->extended_size)
-		return -1;
-
-	/*
-	 * Check for the presence of second magic word at the end of memory
-	 * layout. This detects the case where the user just copied the legacy
-	 * fpstate layout with out copying the extended state information
-	 * in the memory layout.
-	 */
-	if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size))
-	    || magic2 != FP_XSTATE_MAGIC2)
-		return -1;
-
-	return 0;
-}
-
-/*
- * Signal frame handlers.
- */
-static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
-{
-	if (use_fxsr()) {
-		struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
-		struct user_i387_ia32_struct env;
-		struct _fpstate_ia32 __user *fp = buf;
-
-		convert_from_fxsr(&env, tsk);
-
-		if (__copy_to_user(buf, &env, sizeof(env)) ||
-		    __put_user(xsave->i387.swd, &fp->status) ||
-		    __put_user(X86_FXSR_MAGIC, &fp->magic))
-			return -1;
-	} else {
-		struct i387_fsave_struct __user *fp = buf;
-		u32 swd;
-		if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
-			return -1;
-	}
-
-	return 0;
-}
-
-static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
-{
-	struct xsave_struct __user *x = buf;
-	struct _fpx_sw_bytes *sw_bytes;
-	u32 xstate_bv;
-	int err;
-
-	/* Setup the bytes not touched by the [f]xsave and reserved for SW. */
-	sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
-	err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
-
-	if (!use_xsave())
-		return err;
-
-	err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size));
-
-	/*
-	 * Read the xstate_bv which we copied (directly from the cpu or
-	 * from the state in task struct) to the user buffers.
-	 */
-	err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv);
-
-	/*
-	 * For legacy compatible, we always set FP/SSE bits in the bit
-	 * vector while saving the state to the user context. This will
-	 * enable us capturing any changes(during sigreturn) to
-	 * the FP/SSE bits by the legacy applications which don't touch
-	 * xstate_bv in the xsave header.
-	 *
-	 * xsave aware apps can change the xstate_bv in the xsave
-	 * header as well as change any contents in the memory layout.
-	 * xrestore as part of sigreturn will capture all the changes.
-	 */
-	xstate_bv |= XSTATE_FPSSE;
-
-	err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv);
-
-	return err;
-}
-
-static inline int save_user_xstate(struct xsave_struct __user *buf)
-{
-	int err;
-
-	if (use_xsave())
-		err = xsave_user(buf);
-	else if (use_fxsr())
-		err = fxsave_user((struct i387_fxsave_struct __user *) buf);
-	else
-		err = fsave_user((struct i387_fsave_struct __user *) buf);
-
-	if (unlikely(err) && __clear_user(buf, xstate_size))
-		err = -EFAULT;
-	return err;
-}
-
-/*
- * Save the fpu, extended register state to the user signal frame.
- *
- * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
- *  state is copied.
- *  'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
- *
- *	buf == buf_fx for 64-bit frames and 32-bit fsave frame.
- *	buf != buf_fx for 32-bit frames with fxstate.
- *
- * If the fpu, extended register state is live, save the state directly
- * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise,
- * copy the thread's fpu state to the user frame starting at 'buf_fx'.
- *
- * If this is a 32-bit frame with fxstate, put a fsave header before
- * the aligned state at 'buf_fx'.
- *
- * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
- * indicating the absence/presence of the extended state to the user.
- */
-int save_xstate_sig(void __user *buf, void __user *buf_fx, int size)
-{
-	struct xsave_struct *xsave = &current->thread.fpu.state->xsave;
-	struct task_struct *tsk = current;
-	int ia32_fxstate = (buf != buf_fx);
-
-	ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
-			 config_enabled(CONFIG_IA32_EMULATION));
-
-	if (!access_ok(VERIFY_WRITE, buf, size))
-		return -EACCES;
-
-	if (!static_cpu_has(X86_FEATURE_FPU))
-		return fpregs_soft_get(current, NULL, 0,
-			sizeof(struct user_i387_ia32_struct), NULL,
-			(struct _fpstate_ia32 __user *) buf) ? -1 : 1;
-
-	if (user_has_fpu()) {
-		/* Save the live register state to the user directly. */
-		if (save_user_xstate(buf_fx))
-			return -1;
-		/* Update the thread's fxstate to save the fsave header. */
-		if (ia32_fxstate)
-			fpu_fxsave(&tsk->thread.fpu);
-	} else {
-		sanitize_i387_state(tsk);
-		if (__copy_to_user(buf_fx, xsave, xstate_size))
-			return -1;
-	}
-
-	/* Save the fsave header for the 32-bit frames. */
-	if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
-		return -1;
-
-	if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
-		return -1;
-
-	return 0;
-}
-
-static inline void
-sanitize_restored_xstate(struct task_struct *tsk,
-			 struct user_i387_ia32_struct *ia32_env,
-			 u64 xstate_bv, int fx_only)
-{
-	struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
-	struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr;
-
-	if (use_xsave()) {
-		/* These bits must be zero. */
-		memset(xsave_hdr->reserved, 0, 48);
-
-		/*
-		 * Init the state that is not present in the memory
-		 * layout and not enabled by the OS.
-		 */
-		if (fx_only)
-			xsave_hdr->xstate_bv = XSTATE_FPSSE;
-		else
-			xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv);
-	}
-
-	if (use_fxsr()) {
-		/*
-		 * mscsr reserved bits must be masked to zero for security
-		 * reasons.
-		 */
-		xsave->i387.mxcsr &= mxcsr_feature_mask;
-
-		convert_to_fxsr(tsk, ia32_env);
-	}
-}
-
-/*
- * Restore the extended state if present. Otherwise, restore the FP/SSE state.
- */
-static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only)
-{
-	if (use_xsave()) {
-		if ((unsigned long)buf % 64 || fx_only) {
-			u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE;
-			xrstor_state(init_xstate_buf, init_bv);
-			return fxrstor_user(buf);
-		} else {
-			u64 init_bv = pcntxt_mask & ~xbv;
-			if (unlikely(init_bv))
-				xrstor_state(init_xstate_buf, init_bv);
-			return xrestore_user(buf, xbv);
-		}
-	} else if (use_fxsr()) {
-		return fxrstor_user(buf);
-	} else
-		return frstor_user(buf);
-}
-
-int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
-{
-	int ia32_fxstate = (buf != buf_fx);
-	struct task_struct *tsk = current;
-	int state_size = xstate_size;
-	u64 xstate_bv = 0;
-	int fx_only = 0;
-
-	ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
-			 config_enabled(CONFIG_IA32_EMULATION));
-
-	if (!buf) {
-		fpu_reset_state(tsk);
-		return 0;
-	}
-
-	if (!access_ok(VERIFY_READ, buf, size))
-		return -EACCES;
-
-	if (!used_math() && init_fpu(tsk))
-		return -1;
-
-	if (!static_cpu_has(X86_FEATURE_FPU))
-		return fpregs_soft_set(current, NULL,
-				       0, sizeof(struct user_i387_ia32_struct),
-				       NULL, buf) != 0;
-
-	if (use_xsave()) {
-		struct _fpx_sw_bytes fx_sw_user;
-		if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) {
-			/*
-			 * Couldn't find the extended state information in the
-			 * memory layout. Restore just the FP/SSE and init all
-			 * the other extended state.
-			 */
-			state_size = sizeof(struct i387_fxsave_struct);
-			fx_only = 1;
-		} else {
-			state_size = fx_sw_user.xstate_size;
-			xstate_bv = fx_sw_user.xstate_bv;
-		}
-	}
-
-	if (ia32_fxstate) {
-		/*
-		 * For 32-bit frames with fxstate, copy the user state to the
-		 * thread's fpu state, reconstruct fxstate from the fsave
-		 * header. Sanitize the copied state etc.
-		 */
-		struct fpu *fpu = &tsk->thread.fpu;
-		struct user_i387_ia32_struct env;
-		int err = 0;
-
-		/*
-		 * Drop the current fpu which clears used_math(). This ensures
-		 * that any context-switch during the copy of the new state,
-		 * avoids the intermediate state from getting restored/saved.
-		 * Thus avoiding the new restored state from getting corrupted.
-		 * We will be ready to restore/save the state only after
-		 * set_used_math() is again set.
-		 */
-		drop_fpu(tsk);
-
-		if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) ||
-		    __copy_from_user(&env, buf, sizeof(env))) {
-			fpu_finit(fpu);
-			err = -1;
-		} else {
-			sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only);
-		}
-
-		set_used_math();
-		if (use_eager_fpu()) {
-			preempt_disable();
-			math_state_restore();
-			preempt_enable();
-		}
-
-		return err;
-	} else {
-		/*
-		 * For 64-bit frames and 32-bit fsave frames, restore the user
-		 * state to the registers directly (with exceptions handled).
-		 */
-		user_fpu_begin();
-		if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) {
-			fpu_reset_state(tsk);
-			return -1;
-		}
-	}
-
-	return 0;
-}
-
-/*
- * Prepare the SW reserved portion of the fxsave memory layout, indicating
- * the presence of the extended state information in the memory layout
- * pointed by the fpstate pointer in the sigcontext.
- * This will be saved when ever the FP and extended state context is
- * saved on the user stack during the signal handler delivery to the user.
- */
-static void prepare_fx_sw_frame(void)
-{
-	int fsave_header_size = sizeof(struct i387_fsave_struct);
-	int size = xstate_size + FP_XSTATE_MAGIC2_SIZE;
-
-	if (config_enabled(CONFIG_X86_32))
-		size += fsave_header_size;
-
-	fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
-	fx_sw_reserved.extended_size = size;
-	fx_sw_reserved.xstate_bv = pcntxt_mask;
-	fx_sw_reserved.xstate_size = xstate_size;
-
-	if (config_enabled(CONFIG_IA32_EMULATION)) {
-		fx_sw_reserved_ia32 = fx_sw_reserved;
-		fx_sw_reserved_ia32.extended_size += fsave_header_size;
-	}
-}
-
-/*
- * Enable the extended processor state save/restore feature
- */
-static inline void xstate_enable(void)
-{
-	cr4_set_bits(X86_CR4_OSXSAVE);
-	xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
-}
-
-/*
- * Record the offsets and sizes of different state managed by the xsave
- * memory layout.
- */
-static void __init setup_xstate_features(void)
-{
-	int eax, ebx, ecx, edx, leaf = 0x2;
-
-	xstate_features = fls64(pcntxt_mask);
-	xstate_offsets = alloc_bootmem(xstate_features * sizeof(int));
-	xstate_sizes = alloc_bootmem(xstate_features * sizeof(int));
-
-	do {
-		cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
-
-		if (eax == 0)
-			break;
-
-		xstate_offsets[leaf] = ebx;
-		xstate_sizes[leaf] = eax;
-
-		leaf++;
-	} while (1);
-}
-
-/*
- * This function sets up offsets and sizes of all extended states in
- * xsave area. This supports both standard format and compacted format
- * of the xsave aread.
- *
- * Input: void
- * Output: void
- */
-void setup_xstate_comp(void)
-{
-	unsigned int xstate_comp_sizes[sizeof(pcntxt_mask)*8];
-	int i;
-
-	/*
-	 * The FP xstates and SSE xstates are legacy states. They are always
-	 * in the fixed offsets in the xsave area in either compacted form
-	 * or standard form.
-	 */
-	xstate_comp_offsets[0] = 0;
-	xstate_comp_offsets[1] = offsetof(struct i387_fxsave_struct, xmm_space);
-
-	if (!cpu_has_xsaves) {
-		for (i = 2; i < xstate_features; i++) {
-			if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
-				xstate_comp_offsets[i] = xstate_offsets[i];
-				xstate_comp_sizes[i] = xstate_sizes[i];
-			}
-		}
-		return;
-	}
-
-	xstate_comp_offsets[2] = FXSAVE_SIZE + XSAVE_HDR_SIZE;
-
-	for (i = 2; i < xstate_features; i++) {
-		if (test_bit(i, (unsigned long *)&pcntxt_mask))
-			xstate_comp_sizes[i] = xstate_sizes[i];
-		else
-			xstate_comp_sizes[i] = 0;
-
-		if (i > 2)
-			xstate_comp_offsets[i] = xstate_comp_offsets[i-1]
-					+ xstate_comp_sizes[i-1];
-
-	}
-}
-
-/*
- * setup the xstate image representing the init state
- */
-static void __init setup_init_fpu_buf(void)
-{
-	/*
-	 * Setup init_xstate_buf to represent the init state of
-	 * all the features managed by the xsave
-	 */
-	init_xstate_buf = alloc_bootmem_align(xstate_size,
-					      __alignof__(struct xsave_struct));
-	fx_finit(&init_xstate_buf->i387);
-
-	if (!cpu_has_xsave)
-		return;
-
-	setup_xstate_features();
-
-	if (cpu_has_xsaves) {
-		init_xstate_buf->xsave_hdr.xcomp_bv =
-						(u64)1 << 63 | pcntxt_mask;
-		init_xstate_buf->xsave_hdr.xstate_bv = pcntxt_mask;
-	}
-
-	/*
-	 * Init all the features state with header_bv being 0x0
-	 */
-	xrstor_state_booting(init_xstate_buf, -1);
-	/*
-	 * Dump the init state again. This is to identify the init state
-	 * of any feature which is not represented by all zero's.
-	 */
-	xsave_state_booting(init_xstate_buf, -1);
-}
-
-static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
-static int __init eager_fpu_setup(char *s)
-{
-	if (!strcmp(s, "on"))
-		eagerfpu = ENABLE;
-	else if (!strcmp(s, "off"))
-		eagerfpu = DISABLE;
-	else if (!strcmp(s, "auto"))
-		eagerfpu = AUTO;
-	return 1;
-}
-__setup("eagerfpu=", eager_fpu_setup);
-
-
-/*
- * Calculate total size of enabled xstates in XCR0/pcntxt_mask.
- */
-static void __init init_xstate_size(void)
-{
-	unsigned int eax, ebx, ecx, edx;
-	int i;
-
-	if (!cpu_has_xsaves) {
-		cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
-		xstate_size = ebx;
-		return;
-	}
-
-	xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE;
-	for (i = 2; i < 64; i++) {
-		if (test_bit(i, (unsigned long *)&pcntxt_mask)) {
-			cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
-			xstate_size += eax;
-		}
-	}
-}
-
-/*
- * Enable and initialize the xsave feature.
- */
-static void __init xstate_enable_boot_cpu(void)
-{
-	unsigned int eax, ebx, ecx, edx;
-
-	if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
-		WARN(1, KERN_ERR "XSTATE_CPUID missing\n");
-		return;
-	}
-
-	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
-	pcntxt_mask = eax + ((u64)edx << 32);
-
-	if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
-		pr_err("FP/SSE not shown under xsave features 0x%llx\n",
-		       pcntxt_mask);
-		BUG();
-	}
-
-	/*
-	 * Support only the state known to OS.
-	 */
-	pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
-
-	xstate_enable();
-
-	/*
-	 * Recompute the context size for enabled features
-	 */
-	init_xstate_size();
-
-	update_regset_xstate_info(xstate_size, pcntxt_mask);
-	prepare_fx_sw_frame();
-	setup_init_fpu_buf();
-
-	/* Auto enable eagerfpu for xsaveopt */
-	if (cpu_has_xsaveopt && eagerfpu != DISABLE)
-		eagerfpu = ENABLE;
-
-	if (pcntxt_mask & XSTATE_EAGER) {
-		if (eagerfpu == DISABLE) {
-			pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n",
-					pcntxt_mask & XSTATE_EAGER);
-			pcntxt_mask &= ~XSTATE_EAGER;
-		} else {
-			eagerfpu = ENABLE;
-		}
-	}
-
-	pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x using %s\n",
-		pcntxt_mask, xstate_size,
-		cpu_has_xsaves ? "compacted form" : "standard form");
-}
-
-/*
- * For the very first instance, this calls xstate_enable_boot_cpu();
- * for all subsequent instances, this calls xstate_enable().
- *
- * This is somewhat obfuscated due to the lack of powerful enough
- * overrides for the section checks.
- */
-void xsave_init(void)
-{
-	static __refdata void (*next_func)(void) = xstate_enable_boot_cpu;
-	void (*this_func)(void);
-
-	if (!cpu_has_xsave)
-		return;
-
-	this_func = next_func;
-	next_func = xstate_enable;
-	this_func();
-}
-
-/*
- * setup_init_fpu_buf() is __init and it is OK to call it here because
- * init_xstate_buf will be unset only once during boot.
- */
-void __init_refok eager_fpu_init(void)
-{
-	WARN_ON(used_math());
-	current_thread_info()->status = 0;
-
-	if (eagerfpu == ENABLE)
-		setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
-
-	if (!cpu_has_eager_fpu) {
-		stts();
-		return;
-	}
-
-	if (!init_xstate_buf)
-		setup_init_fpu_buf();
-}
-
-/*
- * Given the xsave area and a state inside, this function returns the
- * address of the state.
- *
- * This is the API that is called to get xstate address in either
- * standard format or compacted format of xsave area.
- *
- * Inputs:
- *	xsave: base address of the xsave area;
- *	xstate: state which is defined in xsave.h (e.g. XSTATE_FP, XSTATE_SSE,
- *	etc.)
- * Output:
- *	address of the state in the xsave area.
- */
-void *get_xsave_addr(struct xsave_struct *xsave, int xstate)
-{
-	int feature = fls64(xstate) - 1;
-	if (!test_bit(feature, (unsigned long *)&pcntxt_mask))
-		return NULL;
-
-	return (void *)xsave + xstate_comp_offsets[feature];
-}
-EXPORT_SYMBOL_GPL(get_xsave_addr);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 413a7bf9efbb..d8a1d56276e1 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -86,15 +86,16 @@ config KVM_MMU_AUDIT
 	 auditing of KVM MMU events at runtime.
 
 config KVM_DEVICE_ASSIGNMENT
-	bool "KVM legacy PCI device assignment support"
+	bool "KVM legacy PCI device assignment support (DEPRECATED)"
 	depends on KVM && PCI && IOMMU_API
-	default y
+	default n
 	---help---
 	  Provide support for legacy PCI device assignment through KVM.  The
 	  kernel now also supports a full featured userspace device driver
-	  framework through VFIO, which supersedes much of this support.
+	  framework through VFIO, which supersedes this support and provides
+	  better security.
 
-	  If unsure, say Y.
+	  If unsure, say N.
 
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 16e8f962eaad..67d215cb8953 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,10 +12,10 @@ kvm-y			+= $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(KVM)/async_pf.o
 
 kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
-			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o
+			   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o
 kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)	+= assigned-dev.o iommu.o
-kvm-intel-y		+= vmx.o
-kvm-amd-y		+= svm.o
+kvm-intel-y		+= vmx.o pmu_intel.o
+kvm-amd-y		+= svm.o pmu_amd.o
 
 obj-$(CONFIG_KVM)	+= kvm.o
 obj-$(CONFIG_KVM_INTEL)	+= kvm-intel.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 59b69f6a2844..64dd46793099 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -16,12 +16,14 @@
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
+#include <asm/fpu/internal.h> /* For use_eager_fpu.  Ugh! */
 #include <asm/user.h>
-#include <asm/xsave.h>
+#include <asm/fpu/xstate.h>
 #include "cpuid.h"
 #include "lapic.h"
 #include "mmu.h"
 #include "trace.h"
+#include "pmu.h"
 
 static u32 xstate_required_size(u64 xstate_bv, bool compacted)
 {
@@ -95,6 +97,8 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 	if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
 		best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
 
+	vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu);
+
 	/*
 	 * The existing code assumes virtual address is 48-bit in the canonical
 	 * address checks; exit if it is ever changed.
@@ -107,7 +111,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 	/* Update physical-address width */
 	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
 
-	kvm_pmu_cpuid_update(vcpu);
+	kvm_pmu_refresh(vcpu);
 	return 0;
 }
 
@@ -411,6 +415,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		}
 		break;
 	}
+	case 6: /* Thermal management */
+		entry->eax = 0x4; /* allow ARAT */
+		entry->ebx = 0;
+		entry->ecx = 0;
+		entry->edx = 0;
+		break;
 	case 7: {
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		/* Mask ebx against host capability word 9 */
@@ -587,7 +597,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		break;
 	case 3: /* Processor serial number */
 	case 5: /* MONITOR/MWAIT */
-	case 6: /* Thermal management */
 	case 0xC0000002:
 	case 0xC0000003:
 	case 0xC0000004:
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index c3b1ad9fca81..dd05b9cef6ae 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -70,6 +70,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
 	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
 }
 
+static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+	return best && (best->edx & bit(X86_FEATURE_LM));
+}
+
 static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
@@ -117,4 +125,12 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
 	best = kvm_find_cpuid_entry(vcpu, 7, 0);
 	return best && (best->ebx & bit(X86_FEATURE_RTM));
 }
+
+static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_MPX));
+}
 #endif
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 630bcb0d7a04..e7a4fde5d631 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <asm/kvm_emulate.h>
 #include <linux/stringify.h>
+#include <asm/debugreg.h>
 
 #include "x86.h"
 #include "tss.h"
@@ -523,13 +524,9 @@ static void masked_increment(ulong *reg, ulong mask, int inc)
 static inline void
 register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc)
 {
-	ulong mask;
+	ulong *preg = reg_rmw(ctxt, reg);
 
-	if (ctxt->ad_bytes == sizeof(unsigned long))
-		mask = ~0UL;
-	else
-		mask = ad_mask(ctxt);
-	masked_increment(reg_rmw(ctxt, reg), mask, inc);
+	assign_register(preg, *preg + inc, ctxt->ad_bytes);
 }
 
 static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
@@ -2262,6 +2259,260 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
 	return rc;
 }
 
+static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
+{
+	u32 eax, ebx, ecx, edx;
+
+	eax = 0x80000001;
+	ecx = 0;
+	ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+	return edx & bit(X86_FEATURE_LM);
+}
+
+#define GET_SMSTATE(type, smbase, offset)				  \
+	({								  \
+	 type __val;							  \
+	 int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val,       \
+				     sizeof(__val), NULL);		  \
+	 if (r != X86EMUL_CONTINUE)					  \
+		 return X86EMUL_UNHANDLEABLE;				  \
+	 __val;								  \
+	})
+
+static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
+{
+	desc->g    = (flags >> 23) & 1;
+	desc->d    = (flags >> 22) & 1;
+	desc->l    = (flags >> 21) & 1;
+	desc->avl  = (flags >> 20) & 1;
+	desc->p    = (flags >> 15) & 1;
+	desc->dpl  = (flags >> 13) & 3;
+	desc->s    = (flags >> 12) & 1;
+	desc->type = (flags >>  8) & 15;
+}
+
+static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+{
+	struct desc_struct desc;
+	int offset;
+	u16 selector;
+
+	selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4);
+
+	if (n < 3)
+		offset = 0x7f84 + n * 12;
+	else
+		offset = 0x7f2c + (n - 3) * 12;
+
+	set_desc_base(&desc,      GET_SMSTATE(u32, smbase, offset + 8));
+	set_desc_limit(&desc,     GET_SMSTATE(u32, smbase, offset + 4));
+	rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset));
+	ctxt->ops->set_segment(ctxt, selector, &desc, 0, n);
+	return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+{
+	struct desc_struct desc;
+	int offset;
+	u16 selector;
+	u32 base3;
+
+	offset = 0x7e00 + n * 16;
+
+	selector =                GET_SMSTATE(u16, smbase, offset);
+	rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8);
+	set_desc_limit(&desc,     GET_SMSTATE(u32, smbase, offset + 4));
+	set_desc_base(&desc,      GET_SMSTATE(u32, smbase, offset + 8));
+	base3 =                   GET_SMSTATE(u32, smbase, offset + 12);
+
+	ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
+	return X86EMUL_CONTINUE;
+}
+
+static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
+				     u64 cr0, u64 cr4)
+{
+	int bad;
+
+	/*
+	 * First enable PAE, long mode needs it before CR0.PG = 1 is set.
+	 * Then enable protected mode.	However, PCID cannot be enabled
+	 * if EFER.LMA=0, so set it separately.
+	 */
+	bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+	if (bad)
+		return X86EMUL_UNHANDLEABLE;
+
+	bad = ctxt->ops->set_cr(ctxt, 0, cr0);
+	if (bad)
+		return X86EMUL_UNHANDLEABLE;
+
+	if (cr4 & X86_CR4_PCIDE) {
+		bad = ctxt->ops->set_cr(ctxt, 4, cr4);
+		if (bad)
+			return X86EMUL_UNHANDLEABLE;
+	}
+
+	return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+	struct desc_struct desc;
+	struct desc_ptr dt;
+	u16 selector;
+	u32 val, cr0, cr4;
+	int i;
+
+	cr0 =                      GET_SMSTATE(u32, smbase, 0x7ffc);
+	ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8));
+	ctxt->eflags =             GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
+	ctxt->_eip =               GET_SMSTATE(u32, smbase, 0x7ff0);
+
+	for (i = 0; i < 8; i++)
+		*reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4);
+
+	val = GET_SMSTATE(u32, smbase, 0x7fcc);
+	ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+	val = GET_SMSTATE(u32, smbase, 0x7fc8);
+	ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+	selector =                 GET_SMSTATE(u32, smbase, 0x7fc4);
+	set_desc_base(&desc,       GET_SMSTATE(u32, smbase, 0x7f64));
+	set_desc_limit(&desc,      GET_SMSTATE(u32, smbase, 0x7f60));
+	rsm_set_desc_flags(&desc,  GET_SMSTATE(u32, smbase, 0x7f5c));
+	ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR);
+
+	selector =                 GET_SMSTATE(u32, smbase, 0x7fc0);
+	set_desc_base(&desc,       GET_SMSTATE(u32, smbase, 0x7f80));
+	set_desc_limit(&desc,      GET_SMSTATE(u32, smbase, 0x7f7c));
+	rsm_set_desc_flags(&desc,  GET_SMSTATE(u32, smbase, 0x7f78));
+	ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR);
+
+	dt.address =               GET_SMSTATE(u32, smbase, 0x7f74);
+	dt.size =                  GET_SMSTATE(u32, smbase, 0x7f70);
+	ctxt->ops->set_gdt(ctxt, &dt);
+
+	dt.address =               GET_SMSTATE(u32, smbase, 0x7f58);
+	dt.size =                  GET_SMSTATE(u32, smbase, 0x7f54);
+	ctxt->ops->set_idt(ctxt, &dt);
+
+	for (i = 0; i < 6; i++) {
+		int r = rsm_load_seg_32(ctxt, smbase, i);
+		if (r != X86EMUL_CONTINUE)
+			return r;
+	}
+
+	cr4 = GET_SMSTATE(u32, smbase, 0x7f14);
+
+	ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));
+
+	return rsm_enter_protected_mode(ctxt, cr0, cr4);
+}
+
+static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+	struct desc_struct desc;
+	struct desc_ptr dt;
+	u64 val, cr0, cr4;
+	u32 base3;
+	u16 selector;
+	int i;
+
+	for (i = 0; i < 16; i++)
+		*reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8);
+
+	ctxt->_eip   = GET_SMSTATE(u64, smbase, 0x7f78);
+	ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED;
+
+	val = GET_SMSTATE(u32, smbase, 0x7f68);
+	ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+	val = GET_SMSTATE(u32, smbase, 0x7f60);
+	ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+	cr0 =                       GET_SMSTATE(u64, smbase, 0x7f58);
+	ctxt->ops->set_cr(ctxt, 3,  GET_SMSTATE(u64, smbase, 0x7f50));
+	cr4 =                       GET_SMSTATE(u64, smbase, 0x7f48);
+	ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
+	val =                       GET_SMSTATE(u64, smbase, 0x7ed0);
+	ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA);
+
+	selector =                  GET_SMSTATE(u32, smbase, 0x7e90);
+	rsm_set_desc_flags(&desc,   GET_SMSTATE(u32, smbase, 0x7e92) << 8);
+	set_desc_limit(&desc,       GET_SMSTATE(u32, smbase, 0x7e94));
+	set_desc_base(&desc,        GET_SMSTATE(u32, smbase, 0x7e98));
+	base3 =                     GET_SMSTATE(u32, smbase, 0x7e9c);
+	ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR);
+
+	dt.size =                   GET_SMSTATE(u32, smbase, 0x7e84);
+	dt.address =                GET_SMSTATE(u64, smbase, 0x7e88);
+	ctxt->ops->set_idt(ctxt, &dt);
+
+	selector =                  GET_SMSTATE(u32, smbase, 0x7e70);
+	rsm_set_desc_flags(&desc,   GET_SMSTATE(u32, smbase, 0x7e72) << 8);
+	set_desc_limit(&desc,       GET_SMSTATE(u32, smbase, 0x7e74));
+	set_desc_base(&desc,        GET_SMSTATE(u32, smbase, 0x7e78));
+	base3 =                     GET_SMSTATE(u32, smbase, 0x7e7c);
+	ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR);
+
+	dt.size =                   GET_SMSTATE(u32, smbase, 0x7e64);
+	dt.address =                GET_SMSTATE(u64, smbase, 0x7e68);
+	ctxt->ops->set_gdt(ctxt, &dt);
+
+	for (i = 0; i < 6; i++) {
+		int r = rsm_load_seg_64(ctxt, smbase, i);
+		if (r != X86EMUL_CONTINUE)
+			return r;
+	}
+
+	return rsm_enter_protected_mode(ctxt, cr0, cr4);
+}
+
+static int em_rsm(struct x86_emulate_ctxt *ctxt)
+{
+	unsigned long cr0, cr4, efer;
+	u64 smbase;
+	int ret;
+
+	if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0)
+		return emulate_ud(ctxt);
+
+	/*
+	 * Get back to real mode, to prepare a safe state in which to load
+	 * CR0/CR3/CR4/EFER.  Also this will ensure that addresses passed
+	 * to read_std/write_std are not virtual.
+	 *
+	 * CR4.PCIDE must be zero, because it is a 64-bit mode only feature.
+	 */
+	cr0 = ctxt->ops->get_cr(ctxt, 0);
+	if (cr0 & X86_CR0_PE)
+		ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
+	cr4 = ctxt->ops->get_cr(ctxt, 4);
+	if (cr4 & X86_CR4_PAE)
+		ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
+	efer = 0;
+	ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
+
+	smbase = ctxt->ops->get_smbase(ctxt);
+	if (emulator_has_longmode(ctxt))
+		ret = rsm_load_state_64(ctxt, smbase + 0x8000);
+	else
+		ret = rsm_load_state_32(ctxt, smbase + 0x8000);
+
+	if (ret != X86EMUL_CONTINUE) {
+		/* FIXME: should triple fault */
+		return X86EMUL_UNHANDLEABLE;
+	}
+
+	if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
+		ctxt->ops->set_nmi_mask(ctxt, false);
+
+	ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK;
+	ctxt->emul_flags &= ~X86EMUL_SMM_MASK;
+	return X86EMUL_CONTINUE;
+}
+
 static void
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 			struct desc_struct *cs, struct desc_struct *ss)
@@ -2573,6 +2824,30 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
 	return true;
 }
 
+static void string_registers_quirk(struct x86_emulate_ctxt *ctxt)
+{
+	/*
+	 * Intel CPUs mask the counter and pointers in quite strange
+	 * manner when ECX is zero due to REP-string optimizations.
+	 */
+#ifdef CONFIG_X86_64
+	if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt))
+		return;
+
+	*reg_write(ctxt, VCPU_REGS_RCX) = 0;
+
+	switch (ctxt->b) {
+	case 0xa4:	/* movsb */
+	case 0xa5:	/* movsd/w */
+		*reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1;
+		/* fall through */
+	case 0xaa:	/* stosb */
+	case 0xab:	/* stosd/w */
+		*reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1;
+	}
+#endif
+}
+
 static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
 				struct tss_segment_16 *tss)
 {
@@ -2849,7 +3124,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	ulong old_tss_base =
 		ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
 	u32 desc_limit;
-	ulong desc_addr;
+	ulong desc_addr, dr7;
 
 	/* FIXME: old_tss_base == ~0 ? */
 
@@ -2934,6 +3209,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 		ret = em_push(ctxt);
 	}
 
+	ops->get_dr(ctxt, 7, &dr7);
+	ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN));
+
 	return ret;
 }
 
@@ -3840,7 +4118,7 @@ static const struct opcode group5[] = {
 	F(DstMem | SrcNone | Lock,		em_inc),
 	F(DstMem | SrcNone | Lock,		em_dec),
 	I(SrcMem | NearBranch,			em_call_near_abs),
-	I(SrcMemFAddr | ImplicitOps | Stack,	em_call_far),
+	I(SrcMemFAddr | ImplicitOps,		em_call_far),
 	I(SrcMem | NearBranch,			em_jmp_abs),
 	I(SrcMemFAddr | ImplicitOps,		em_jmp_far),
 	I(SrcMem | Stack,			em_push), D(Undefined),
@@ -4173,7 +4451,7 @@ static const struct opcode twobyte_table[256] = {
 	F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
 	/* 0xA8 - 0xAF */
 	I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
-	DI(ImplicitOps, rsm),
+	II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm),
 	F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
 	F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
 	F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
@@ -4871,7 +5149,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 				fetch_possible_mmx_operand(ctxt, &ctxt->dst);
 		}
 
-		if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+		if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
 			rc = emulator_check_intercept(ctxt, ctxt->intercept,
 						      X86_ICPT_PRE_EXCEPT);
 			if (rc != X86EMUL_CONTINUE)
@@ -4900,7 +5178,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 				goto done;
 		}
 
-		if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+		if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
 			rc = emulator_check_intercept(ctxt, ctxt->intercept,
 						      X86_ICPT_POST_EXCEPT);
 			if (rc != X86EMUL_CONTINUE)
@@ -4910,6 +5188,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 		if (ctxt->rep_prefix && (ctxt->d & String)) {
 			/* All REP prefixes have the same first termination condition */
 			if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
+				string_registers_quirk(ctxt);
 				ctxt->eip = ctxt->_eip;
 				ctxt->eflags &= ~X86_EFLAGS_RF;
 				goto done;
@@ -4953,7 +5232,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 
 special_insn:
 
-	if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+	if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
 		rc = emulator_check_intercept(ctxt, ctxt->intercept,
 					      X86_ICPT_POST_MEMACCESS);
 		if (rc != X86EMUL_CONTINUE)
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 28146f03c514..856f79105bb5 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -349,6 +349,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
 	irqe.delivery_mode = entry->fields.delivery_mode << 8;
 	irqe.level = 1;
 	irqe.shorthand = 0;
+	irqe.msi_redir_hint = false;
 
 	if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
 		ioapic->irr_delivered |= 1 << irq;
@@ -637,11 +638,9 @@ void kvm_ioapic_destroy(struct kvm *kvm)
 	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 
 	cancel_delayed_work_sync(&ioapic->eoi_inject);
-	if (ioapic) {
-		kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
-		kvm->arch.vioapic = NULL;
-		kfree(ioapic);
-	}
+	kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+	kvm->arch.vioapic = NULL;
+	kfree(ioapic);
 }
 
 int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 72298b3ac025..9efff9e5b58c 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -31,6 +31,8 @@
 
 #include "ioapic.h"
 
+#include "lapic.h"
+
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
 			   struct kvm *kvm, int irq_source_id, int level,
 			   bool line_status)
@@ -48,11 +50,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
 				line_status);
 }
 
-inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
-{
-	return irq->delivery_mode == APIC_DM_LOWEST;
-}
-
 int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 		struct kvm_lapic_irq *irq, unsigned long *dest_map)
 {
@@ -60,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 	struct kvm_vcpu *vcpu, *lowest = NULL;
 
 	if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
-			kvm_is_dm_lowest_prio(irq)) {
+			kvm_lowest_prio_delivery(irq)) {
 		printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
 		irq->delivery_mode = APIC_DM_FIXED;
 	}
@@ -76,7 +73,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 					irq->dest_id, irq->dest_mode))
 			continue;
 
-		if (!kvm_is_dm_lowest_prio(irq)) {
+		if (!kvm_lowest_prio_delivery(irq)) {
 			if (r < 0)
 				r = 0;
 			r += kvm_apic_set_irq(vcpu, irq, dest_map);
@@ -106,9 +103,10 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
 	irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
 	irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
 	irq->delivery_mode = e->msi.data & 0x700;
+	irq->msi_redir_hint = ((e->msi.address_lo
+		& MSI_ADDR_REDIRECTION_LOWPRI) > 0);
 	irq->level = 1;
 	irq->shorthand = 0;
-	/* TODO Deal with RH bit of MSI message address */
 }
 
 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 544076c4f44b..e1e89ee4af75 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -99,4 +99,9 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
 	return vcpu->arch.hflags & HF_GUEST_MASK;
 }
 
+static inline bool is_smm(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.hflags & HF_SMM_MASK;
+}
+
 #endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 629af0f1c5c4..36e9de1b4127 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -240,6 +240,15 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
 	recalculate_apic_map(apic->vcpu->kvm);
 }
 
+static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id)
+{
+	u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+
+	apic_set_reg(apic, APIC_ID, id << 24);
+	apic_set_reg(apic, APIC_LDR, ldr);
+	recalculate_apic_map(apic->vcpu->kvm);
+}
+
 static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
 {
 	return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
@@ -728,7 +737,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 
 		dst = map->logical_map[cid];
 
-		if (irq->delivery_mode == APIC_DM_LOWEST) {
+		if (kvm_lowest_prio_delivery(irq)) {
 			int l = -1;
 			for_each_set_bit(i, &bitmap, 16) {
 				if (!dst[i])
@@ -799,7 +808,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		break;
 
 	case APIC_DM_SMI:
-		apic_debug("Ignoring guest SMI\n");
+		result = 1;
+		kvm_make_request(KVM_REQ_SMI, vcpu);
+		kvm_vcpu_kick(vcpu);
 		break;
 
 	case APIC_DM_NMI:
@@ -914,9 +925,10 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 	irq.vector = icr_low & APIC_VECTOR_MASK;
 	irq.delivery_mode = icr_low & APIC_MODE_MASK;
 	irq.dest_mode = icr_low & APIC_DEST_MASK;
-	irq.level = icr_low & APIC_INT_ASSERT;
+	irq.level = (icr_low & APIC_INT_ASSERT) != 0;
 	irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
 	irq.shorthand = icr_low & APIC_SHORT_MASK;
+	irq.msi_redir_hint = false;
 	if (apic_x2apic_mode(apic))
 		irq.dest_id = icr_high;
 	else
@@ -926,10 +938,11 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 
 	apic_debug("icr_high 0x%x, icr_low 0x%x, "
 		   "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
-		   "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
+		   "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, "
+		   "msi_redir_hint 0x%x\n",
 		   icr_high, icr_low, irq.shorthand, irq.dest_id,
 		   irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
-		   irq.vector);
+		   irq.vector, irq.msi_redir_hint);
 
 	kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
 }
@@ -1090,6 +1103,17 @@ static void update_divide_count(struct kvm_lapic *apic)
 				   apic->divide_count);
 }
 
+static void apic_update_lvtt(struct kvm_lapic *apic)
+{
+	u32 timer_mode = kvm_apic_get_reg(apic, APIC_LVTT) &
+			apic->lapic_timer.timer_mode_mask;
+
+	if (apic->lapic_timer.timer_mode != timer_mode) {
+		apic->lapic_timer.timer_mode = timer_mode;
+		hrtimer_cancel(&apic->lapic_timer.timer);
+	}
+}
+
 static void apic_timer_expired(struct kvm_lapic *apic)
 {
 	struct kvm_vcpu *vcpu = apic->vcpu;
@@ -1298,6 +1322,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 				apic_set_reg(apic, APIC_LVTT + 0x10 * i,
 					     lvt_val | APIC_LVT_MASKED);
 			}
+			apic_update_lvtt(apic);
 			atomic_set(&apic->lapic_timer.pending, 0);
 
 		}
@@ -1330,20 +1355,13 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 
 		break;
 
-	case APIC_LVTT: {
-		u32 timer_mode = val & apic->lapic_timer.timer_mode_mask;
-
-		if (apic->lapic_timer.timer_mode != timer_mode) {
-			apic->lapic_timer.timer_mode = timer_mode;
-			hrtimer_cancel(&apic->lapic_timer.timer);
-		}
-
+	case APIC_LVTT:
 		if (!kvm_apic_sw_enabled(apic))
 			val |= APIC_LVT_MASKED;
 		val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
 		apic_set_reg(apic, APIC_LVTT, val);
+		apic_update_lvtt(apic);
 		break;
-	}
 
 	case APIC_TMICT:
 		if (apic_lvtt_tscdeadline(apic))
@@ -1536,9 +1554,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 
 	if ((old_value ^ value) & X2APIC_ENABLE) {
 		if (value & X2APIC_ENABLE) {
-			u32 id = kvm_apic_id(apic);
-			u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
-			kvm_apic_set_ldr(apic, ldr);
+			kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
 			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
 		} else
 			kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
@@ -1557,7 +1573,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 
 }
 
-void kvm_lapic_reset(struct kvm_vcpu *vcpu)
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
 	struct kvm_lapic *apic;
 	int i;
@@ -1571,19 +1587,22 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 	/* Stop the timer in case it's a reset to an active apic */
 	hrtimer_cancel(&apic->lapic_timer.timer);
 
-	kvm_apic_set_id(apic, vcpu->vcpu_id);
+	if (!init_event)
+		kvm_apic_set_id(apic, vcpu->vcpu_id);
 	kvm_apic_set_version(apic->vcpu);
 
 	for (i = 0; i < APIC_LVT_NUM; i++)
 		apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
-	apic->lapic_timer.timer_mode = 0;
-	apic_set_reg(apic, APIC_LVT0,
-		     SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+	apic_update_lvtt(apic);
+	if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_LINT0_REENABLED))
+		apic_set_reg(apic, APIC_LVT0,
+			     SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
 
 	apic_set_reg(apic, APIC_DFR, 0xffffffffU);
 	apic_set_spiv(apic, 0xff);
 	apic_set_reg(apic, APIC_TASKPRI, 0);
-	kvm_apic_set_ldr(apic, 0);
+	if (!apic_x2apic_mode(apic))
+		kvm_apic_set_ldr(apic, 0);
 	apic_set_reg(apic, APIC_ESR, 0);
 	apic_set_reg(apic, APIC_ICR, 0);
 	apic_set_reg(apic, APIC_ICR2, 0);
@@ -1712,7 +1731,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 			APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
 
 	static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
-	kvm_lapic_reset(vcpu);
+	kvm_lapic_reset(vcpu, false);
 	kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
 
 	return 0;
@@ -1802,6 +1821,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
 
 	apic_update_ppr(apic);
 	hrtimer_cancel(&apic->lapic_timer.timer);
+	apic_update_lvtt(apic);
 	update_divide_count(apic);
 	start_apic_timer(apic);
 	apic->irr_pending = true;
@@ -2043,11 +2063,22 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
 	if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events)
 		return;
 
-	pe = xchg(&apic->pending_events, 0);
+	/*
+	 * INITs are latched while in SMM.  Because an SMM CPU cannot
+	 * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs
+	 * and delay processing of INIT until the next RSM.
+	 */
+	if (is_smm(vcpu)) {
+		WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
+		if (test_bit(KVM_APIC_SIPI, &apic->pending_events))
+			clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+		return;
+	}
 
+	pe = xchg(&apic->pending_events, 0);
 	if (test_bit(KVM_APIC_INIT, &pe)) {
-		kvm_lapic_reset(vcpu);
-		kvm_vcpu_reset(vcpu);
+		kvm_lapic_reset(vcpu, true);
+		kvm_vcpu_reset(vcpu, true);
 		if (kvm_vcpu_is_bsp(apic->vcpu))
 			vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 		else
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 9d28383fc1e7..f2f4e10ab772 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -48,7 +48,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
 int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
 void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
 void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
@@ -150,7 +150,18 @@ static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
 
 static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
 {
-	return vcpu->arch.apic->pending_events;
+	return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events;
+}
+
+static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
+{
+	return (irq->delivery_mode == APIC_DM_LOWEST ||
+			irq->msi_redir_hint);
+}
+
+static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
+{
+	return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
 }
 
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d43867c33bc4..f807496b62c2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -223,15 +223,15 @@ static unsigned int get_mmio_spte_generation(u64 spte)
 	return gen;
 }
 
-static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
 {
-	return kvm_memslots(kvm)->generation & MMIO_GEN_MASK;
+	return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
 }
 
-static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
+static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
 			   unsigned access)
 {
-	unsigned int gen = kvm_current_mmio_generation(kvm);
+	unsigned int gen = kvm_current_mmio_generation(vcpu);
 	u64 mask = generation_mmio_spte_mask(gen);
 
 	access &= ACC_WRITE_MASK | ACC_USER_MASK;
@@ -258,22 +258,22 @@ static unsigned get_mmio_spte_access(u64 spte)
 	return (spte & ~mask) & ~PAGE_MASK;
 }
 
-static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 			  pfn_t pfn, unsigned access)
 {
 	if (unlikely(is_noslot_pfn(pfn))) {
-		mark_mmio_spte(kvm, sptep, gfn, access);
+		mark_mmio_spte(vcpu, sptep, gfn, access);
 		return true;
 	}
 
 	return false;
 }
 
-static bool check_mmio_spte(struct kvm *kvm, u64 spte)
+static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 {
 	unsigned int kvm_gen, spte_gen;
 
-	kvm_gen = kvm_current_mmio_generation(kvm);
+	kvm_gen = kvm_current_mmio_generation(vcpu);
 	spte_gen = get_mmio_spte_generation(spte);
 
 	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
@@ -804,30 +804,36 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
 	return &slot->arch.lpage_info[level - 2][idx];
 }
 
-static void account_shadowed(struct kvm *kvm, gfn_t gfn)
+static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
+	struct kvm_memslots *slots;
 	struct kvm_memory_slot *slot;
 	struct kvm_lpage_info *linfo;
+	gfn_t gfn;
 	int i;
 
-	slot = gfn_to_memslot(kvm, gfn);
-	for (i = PT_DIRECTORY_LEVEL;
-	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+	gfn = sp->gfn;
+	slots = kvm_memslots_for_spte_role(kvm, sp->role);
+	slot = __gfn_to_memslot(slots, gfn);
+	for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
 		linfo = lpage_info_slot(gfn, slot, i);
 		linfo->write_count += 1;
 	}
 	kvm->arch.indirect_shadow_pages++;
 }
 
-static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
+static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
+	struct kvm_memslots *slots;
 	struct kvm_memory_slot *slot;
 	struct kvm_lpage_info *linfo;
+	gfn_t gfn;
 	int i;
 
-	slot = gfn_to_memslot(kvm, gfn);
-	for (i = PT_DIRECTORY_LEVEL;
-	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+	gfn = sp->gfn;
+	slots = kvm_memslots_for_spte_role(kvm, sp->role);
+	slot = __gfn_to_memslot(slots, gfn);
+	for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
 		linfo = lpage_info_slot(gfn, slot, i);
 		linfo->write_count -= 1;
 		WARN_ON(linfo->write_count < 0);
@@ -835,14 +841,14 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 	kvm->arch.indirect_shadow_pages--;
 }
 
-static int has_wrprotected_page(struct kvm *kvm,
+static int has_wrprotected_page(struct kvm_vcpu *vcpu,
 				gfn_t gfn,
 				int level)
 {
 	struct kvm_memory_slot *slot;
 	struct kvm_lpage_info *linfo;
 
-	slot = gfn_to_memslot(kvm, gfn);
+	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 	if (slot) {
 		linfo = lpage_info_slot(gfn, slot, level);
 		return linfo->write_count;
@@ -858,8 +864,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
 
 	page_size = kvm_host_page_size(kvm, gfn);
 
-	for (i = PT_PAGE_TABLE_LEVEL;
-	     i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
+	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
 		if (page_size >= KVM_HPAGE_SIZE(i))
 			ret = i;
 		else
@@ -875,7 +880,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
 {
 	struct kvm_memory_slot *slot;
 
-	slot = gfn_to_memslot(vcpu->kvm, gfn);
+	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
 	      (no_dirty_log && slot->dirty_bitmap))
 		slot = NULL;
@@ -900,7 +905,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 	max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
 
 	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
-		if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
+		if (has_wrprotected_page(vcpu, large_gfn, level))
 			break;
 
 	return level - 1;
@@ -1042,12 +1047,14 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
 /*
  * Take gfn and return the reverse mapping to it.
  */
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp)
 {
+	struct kvm_memslots *slots;
 	struct kvm_memory_slot *slot;
 
-	slot = gfn_to_memslot(kvm, gfn);
-	return __gfn_to_rmap(gfn, level, slot);
+	slots = kvm_memslots_for_spte_role(kvm, sp->role);
+	slot = __gfn_to_memslot(slots, gfn);
+	return __gfn_to_rmap(gfn, sp->role.level, slot);
 }
 
 static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -1065,7 +1072,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
 	sp = page_header(__pa(spte));
 	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
-	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
+	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
 	return pte_list_add(vcpu, spte, rmapp);
 }
 
@@ -1077,7 +1084,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 
 	sp = page_header(__pa(spte));
 	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
-	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
+	rmapp = gfn_to_rmap(kvm, gfn, sp);
 	pte_list_remove(spte, rmapp);
 }
 
@@ -1142,6 +1149,11 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
 	return NULL;
 }
 
+#define for_each_rmap_spte(_rmap_, _iter_, _spte_)			    \
+	   for (_spte_ = rmap_get_first(*_rmap_, _iter_);		    \
+		_spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;});  \
+			_spte_ = rmap_get_next(_iter_))
+
 static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
 	if (mmu_spte_clear_track_bits(sptep))
@@ -1205,12 +1217,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
 	struct rmap_iterator iter;
 	bool flush = false;
 
-	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
-		BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+	for_each_rmap_spte(rmapp, &iter, sptep)
 		flush |= spte_write_protect(kvm, sptep, pt_protect);
-		sptep = rmap_get_next(&iter);
-	}
 
 	return flush;
 }
@@ -1232,12 +1240,8 @@ static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
 	struct rmap_iterator iter;
 	bool flush = false;
 
-	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
-		BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+	for_each_rmap_spte(rmapp, &iter, sptep)
 		flush |= spte_clear_dirty(kvm, sptep);
-		sptep = rmap_get_next(&iter);
-	}
 
 	return flush;
 }
@@ -1259,12 +1263,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
 	struct rmap_iterator iter;
 	bool flush = false;
 
-	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
-		BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+	for_each_rmap_spte(rmapp, &iter, sptep)
 		flush |= spte_set_dirty(kvm, sptep);
-		sptep = rmap_get_next(&iter);
-	}
 
 	return flush;
 }
@@ -1342,42 +1342,45 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 		kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
+static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
 {
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
 	int i;
 	bool write_protected = false;
 
-	slot = gfn_to_memslot(kvm, gfn);
+	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 
-	for (i = PT_PAGE_TABLE_LEVEL;
-	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
 		rmapp = __gfn_to_rmap(gfn, i, slot);
-		write_protected |= __rmap_write_protect(kvm, rmapp, true);
+		write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true);
 	}
 
 	return write_protected;
 }
 
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
-			   struct kvm_memory_slot *slot, gfn_t gfn, int level,
-			   unsigned long data)
+static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
-	int need_tlb_flush = 0;
+	bool flush = false;
 
 	while ((sptep = rmap_get_first(*rmapp, &iter))) {
 		BUG_ON(!(*sptep & PT_PRESENT_MASK));
-		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n",
-			     sptep, *sptep, gfn, level);
+		rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
 
 		drop_spte(kvm, sptep);
-		need_tlb_flush = 1;
+		flush = true;
 	}
 
-	return need_tlb_flush;
+	return flush;
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+			   struct kvm_memory_slot *slot, gfn_t gfn, int level,
+			   unsigned long data)
+{
+	return kvm_zap_rmapp(kvm, rmapp);
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
@@ -1394,8 +1397,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	WARN_ON(pte_huge(*ptep));
 	new_pfn = pte_pfn(*ptep);
 
-	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
-		BUG_ON(!is_shadow_present_pte(*sptep));
+restart:
+	for_each_rmap_spte(rmapp, &iter, sptep) {
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
 			     sptep, *sptep, gfn, level);
 
@@ -1403,7 +1406,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 		if (pte_write(*ptep)) {
 			drop_spte(kvm, sptep);
-			sptep = rmap_get_first(*rmapp, &iter);
+			goto restart;
 		} else {
 			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
 			new_spte |= (u64)new_pfn << PAGE_SHIFT;
@@ -1414,7 +1417,6 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 			mmu_spte_clear_track_bits(sptep);
 			mmu_spte_set(sptep, new_spte);
-			sptep = rmap_get_next(&iter);
 		}
 	}
 
@@ -1424,6 +1426,74 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	return 0;
 }
 
+struct slot_rmap_walk_iterator {
+	/* input fields. */
+	struct kvm_memory_slot *slot;
+	gfn_t start_gfn;
+	gfn_t end_gfn;
+	int start_level;
+	int end_level;
+
+	/* output fields. */
+	gfn_t gfn;
+	unsigned long *rmap;
+	int level;
+
+	/* private field. */
+	unsigned long *end_rmap;
+};
+
+static void
+rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
+{
+	iterator->level = level;
+	iterator->gfn = iterator->start_gfn;
+	iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
+	iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
+					   iterator->slot);
+}
+
+static void
+slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
+		    struct kvm_memory_slot *slot, int start_level,
+		    int end_level, gfn_t start_gfn, gfn_t end_gfn)
+{
+	iterator->slot = slot;
+	iterator->start_level = start_level;
+	iterator->end_level = end_level;
+	iterator->start_gfn = start_gfn;
+	iterator->end_gfn = end_gfn;
+
+	rmap_walk_init_level(iterator, iterator->start_level);
+}
+
+static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
+{
+	return !!iterator->rmap;
+}
+
+static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
+{
+	if (++iterator->rmap <= iterator->end_rmap) {
+		iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
+		return;
+	}
+
+	if (++iterator->level > iterator->end_level) {
+		iterator->rmap = NULL;
+		return;
+	}
+
+	rmap_walk_init_level(iterator, iterator->level);
+}
+
+#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
+	   _start_gfn, _end_gfn, _iter_)				\
+	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,		\
+				 _end_level_, _start_gfn, _end_gfn);	\
+	     slot_rmap_walk_okay(_iter_);				\
+	     slot_rmap_walk_next(_iter_))
+
 static int kvm_handle_hva_range(struct kvm *kvm,
 				unsigned long start,
 				unsigned long end,
@@ -1435,48 +1505,36 @@ static int kvm_handle_hva_range(struct kvm *kvm,
 					       int level,
 					       unsigned long data))
 {
-	int j;
-	int ret = 0;
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
+	struct slot_rmap_walk_iterator iterator;
+	int ret = 0;
+	int i;
 
-	slots = kvm_memslots(kvm);
-
-	kvm_for_each_memslot(memslot, slots) {
-		unsigned long hva_start, hva_end;
-		gfn_t gfn_start, gfn_end;
-
-		hva_start = max(start, memslot->userspace_addr);
-		hva_end = min(end, memslot->userspace_addr +
-					(memslot->npages << PAGE_SHIFT));
-		if (hva_start >= hva_end)
-			continue;
-		/*
-		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
-		 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
-		 */
-		gfn_start = hva_to_gfn_memslot(hva_start, memslot);
-		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
-		for (j = PT_PAGE_TABLE_LEVEL;
-		     j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
-			unsigned long idx, idx_end;
-			unsigned long *rmapp;
-			gfn_t gfn = gfn_start;
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		slots = __kvm_memslots(kvm, i);
+		kvm_for_each_memslot(memslot, slots) {
+			unsigned long hva_start, hva_end;
+			gfn_t gfn_start, gfn_end;
 
+			hva_start = max(start, memslot->userspace_addr);
+			hva_end = min(end, memslot->userspace_addr +
+				      (memslot->npages << PAGE_SHIFT));
+			if (hva_start >= hva_end)
+				continue;
 			/*
-			 * {idx(page_j) | page_j intersects with
-			 *  [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
+			 * {gfn(page) | page intersects with [hva_start, hva_end)} =
+			 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
 			 */
-			idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
-			idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
-
-			rmapp = __gfn_to_rmap(gfn_start, j, memslot);
-
-			for (; idx <= idx_end;
-			       ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j)))
-				ret |= handler(kvm, rmapp++, memslot,
-					       gfn, j, data);
+			gfn_start = hva_to_gfn_memslot(hva_start, memslot);
+			gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+
+			for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
+						 PT_MAX_HUGEPAGE_LEVEL,
+						 gfn_start, gfn_end - 1,
+						 &iterator)
+				ret |= handler(kvm, iterator.rmap, memslot,
+					       iterator.gfn, iterator.level, data);
 		}
 	}
 
@@ -1518,16 +1576,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 	BUG_ON(!shadow_accessed_mask);
 
-	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
-	     sptep = rmap_get_next(&iter)) {
-		BUG_ON(!is_shadow_present_pte(*sptep));
-
+	for_each_rmap_spte(rmapp, &iter, sptep)
 		if (*sptep & shadow_accessed_mask) {
 			young = 1;
 			clear_bit((ffs(shadow_accessed_mask) - 1),
 				 (unsigned long *)sptep);
 		}
-	}
+
 	trace_kvm_age_page(gfn, level, slot, young);
 	return young;
 }
@@ -1548,15 +1603,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 		goto out;
 
-	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
-	     sptep = rmap_get_next(&iter)) {
-		BUG_ON(!is_shadow_present_pte(*sptep));
-
+	for_each_rmap_spte(rmapp, &iter, sptep)
 		if (*sptep & shadow_accessed_mask) {
 			young = 1;
 			break;
 		}
-	}
 out:
 	return young;
 }
@@ -1570,7 +1621,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
 	sp = page_header(__pa(spte));
 
-	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
+	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
 
 	kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0);
 	kvm_flush_remote_tlbs(vcpu->kvm);
@@ -1990,7 +2041,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 		bool protected = false;
 
 		for_each_sp(pages, sp, parents, i)
-			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
+			protected |= rmap_write_protect(vcpu, sp->gfn);
 
 		if (protected)
 			kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2088,12 +2139,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	hlist_add_head(&sp->hash_link,
 		&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
 	if (!direct) {
-		if (rmap_write_protect(vcpu->kvm, gfn))
+		if (rmap_write_protect(vcpu, gfn))
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
 			kvm_sync_pages(vcpu, gfn);
 
-		account_shadowed(vcpu->kvm, gfn);
+		account_shadowed(vcpu->kvm, sp);
 	}
 	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
 	init_shadow_page_table(sp);
@@ -2274,7 +2325,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	kvm_mmu_unlink_parents(kvm, sp);
 
 	if (!sp->role.invalid && !sp->role.direct)
-		unaccount_shadowed(kvm, sp->gfn);
+		unaccount_shadowed(kvm, sp);
 
 	if (sp->unsync)
 		kvm_unlink_unsync_page(kvm, sp);
@@ -2386,111 +2437,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
-/*
- * The function is based on mtrr_type_lookup() in
- * arch/x86/kernel/cpu/mtrr/generic.c
- */
-static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
-			 u64 start, u64 end)
-{
-	int i;
-	u64 base, mask;
-	u8 prev_match, curr_match;
-	int num_var_ranges = KVM_NR_VAR_MTRR;
-
-	if (!mtrr_state->enabled)
-		return 0xFF;
-
-	/* Make end inclusive end, instead of exclusive */
-	end--;
-
-	/* Look in fixed ranges. Just return the type as per start */
-	if (mtrr_state->have_fixed && (start < 0x100000)) {
-		int idx;
-
-		if (start < 0x80000) {
-			idx = 0;
-			idx += (start >> 16);
-			return mtrr_state->fixed_ranges[idx];
-		} else if (start < 0xC0000) {
-			idx = 1 * 8;
-			idx += ((start - 0x80000) >> 14);
-			return mtrr_state->fixed_ranges[idx];
-		} else if (start < 0x1000000) {
-			idx = 3 * 8;
-			idx += ((start - 0xC0000) >> 12);
-			return mtrr_state->fixed_ranges[idx];
-		}
-	}
-
-	/*
-	 * Look in variable ranges
-	 * Look of multiple ranges matching this address and pick type
-	 * as per MTRR precedence
-	 */
-	if (!(mtrr_state->enabled & 2))
-		return mtrr_state->def_type;
-
-	prev_match = 0xFF;
-	for (i = 0; i < num_var_ranges; ++i) {
-		unsigned short start_state, end_state;
-
-		if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
-			continue;
-
-		base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
-		       (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
-		mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
-		       (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
-
-		start_state = ((start & mask) == (base & mask));
-		end_state = ((end & mask) == (base & mask));
-		if (start_state != end_state)
-			return 0xFE;
-
-		if ((start & mask) != (base & mask))
-			continue;
-
-		curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
-		if (prev_match == 0xFF) {
-			prev_match = curr_match;
-			continue;
-		}
-
-		if (prev_match == MTRR_TYPE_UNCACHABLE ||
-		    curr_match == MTRR_TYPE_UNCACHABLE)
-			return MTRR_TYPE_UNCACHABLE;
-
-		if ((prev_match == MTRR_TYPE_WRBACK &&
-		     curr_match == MTRR_TYPE_WRTHROUGH) ||
-		    (prev_match == MTRR_TYPE_WRTHROUGH &&
-		     curr_match == MTRR_TYPE_WRBACK)) {
-			prev_match = MTRR_TYPE_WRTHROUGH;
-			curr_match = MTRR_TYPE_WRTHROUGH;
-		}
-
-		if (prev_match != curr_match)
-			return MTRR_TYPE_UNCACHABLE;
-	}
-
-	if (prev_match != 0xFF)
-		return prev_match;
-
-	return mtrr_state->def_type;
-}
-
-u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
-	u8 mtrr;
-
-	mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
-			     (gfn << PAGE_SHIFT) + PAGE_SIZE);
-	if (mtrr == 0xfe || mtrr == 0xff)
-		mtrr = MTRR_TYPE_WRBACK;
-	return mtrr;
-}
-EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
-
 static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
 	trace_kvm_mmu_unsync_page(sp);
@@ -2541,7 +2487,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	u64 spte;
 	int ret = 0;
 
-	if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access))
+	if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
 		return 0;
 
 	spte = PT_PRESENT_MASK;
@@ -2578,7 +2524,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		 * be fixed if guest refault.
 		 */
 		if (level > PT_PAGE_TABLE_LEVEL &&
-		    has_wrprotected_page(vcpu->kvm, gfn, level))
+		    has_wrprotected_page(vcpu, gfn, level))
 			goto done;
 
 		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@ -2602,7 +2548,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	}
 
 	if (pte_access & ACC_WRITE_MASK) {
-		mark_page_dirty(vcpu->kvm, gfn);
+		kvm_vcpu_mark_page_dirty(vcpu, gfn);
 		spte |= shadow_dirty_mask;
 	}
 
@@ -2692,15 +2638,17 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 				    u64 *start, u64 *end)
 {
 	struct page *pages[PTE_PREFETCH_NUM];
+	struct kvm_memory_slot *slot;
 	unsigned access = sp->role.access;
 	int i, ret;
 	gfn_t gfn;
 
 	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
-	if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
+	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
+	if (!slot)
 		return -1;
 
-	ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
+	ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
 	if (ret <= 0)
 		return -1;
 
@@ -2818,7 +2766,7 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
 		return 1;
 
 	if (pfn == KVM_PFN_ERR_HWPOISON) {
-		kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
+		kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
 		return 0;
 	}
 
@@ -2841,7 +2789,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
 	    level == PT_PAGE_TABLE_LEVEL &&
 	    PageTransCompound(pfn_to_page(pfn)) &&
-	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
+	    !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
 		unsigned long mask;
 		/*
 		 * mmu_notifier_retry was successful and we hold the
@@ -2933,7 +2881,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	 * Compare with set_spte where instead shadow_dirty_mask is set.
 	 */
 	if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
-		mark_page_dirty(vcpu->kvm, gfn);
+		kvm_vcpu_mark_page_dirty(vcpu, gfn);
 
 	return true;
 }
@@ -3388,7 +3336,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 		gfn_t gfn = get_mmio_spte_gfn(spte);
 		unsigned access = get_mmio_spte_access(spte);
 
-		if (!check_mmio_spte(vcpu->kvm, spte))
+		if (!check_mmio_spte(vcpu, spte))
 			return RET_MMIO_PF_INVALID;
 
 		if (direct)
@@ -3460,7 +3408,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 	arch.direct_map = vcpu->arch.mmu.direct_map;
 	arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
 
-	return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch);
+	return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }
 
 static bool can_do_async_pf(struct kvm_vcpu *vcpu)
@@ -3475,10 +3423,12 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable)
 {
+	struct kvm_memory_slot *slot;
 	bool async;
 
-	*pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
-
+	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+	async = false;
+	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
 	if (!async)
 		return false; /* *pfn has correct page already */
 
@@ -3492,11 +3442,20 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			return true;
 	}
 
-	*pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
-
+	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
 	return false;
 }
 
+static bool
+check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+{
+	int page_num = KVM_PAGES_PER_HPAGE(level);
+
+	gfn &= ~(page_num - 1);
+
+	return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
+}
+
 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 			  bool prefault)
 {
@@ -3522,9 +3481,17 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (r)
 		return r;
 
-	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+	if (mapping_level_dirty_bitmap(vcpu, gfn) ||
+	    !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL))
+		force_pt_level = 1;
+	else
+		force_pt_level = 0;
+
 	if (likely(!force_pt_level)) {
 		level = mapping_level(vcpu, gfn);
+		if (level > PT_DIRECTORY_LEVEL &&
+		    !check_hugepage_cache_consistency(vcpu, gfn, level))
+			level = PT_DIRECTORY_LEVEL;
 		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
 	} else
 		level = PT_PAGE_TABLE_LEVEL;
@@ -3590,7 +3557,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
 	vcpu->arch.mmu.inject_page_fault(vcpu, fault);
 }
 
-static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 			   unsigned access, int *nr_present)
 {
 	if (unlikely(is_mmio_spte(*sptep))) {
@@ -3600,7 +3567,7 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
 		}
 
 		(*nr_present)++;
-		mark_mmio_spte(kvm, sptep, gfn, access);
+		mark_mmio_spte(vcpu, sptep, gfn, access);
 		return true;
 	}
 
@@ -3736,8 +3703,8 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
 	}
 }
 
-void update_permission_bitmask(struct kvm_vcpu *vcpu,
-		struct kvm_mmu *mmu, bool ept)
+static void update_permission_bitmask(struct kvm_vcpu *vcpu,
+				      struct kvm_mmu *mmu, bool ept)
 {
 	unsigned bit, byte, pfec;
 	u8 map;
@@ -3878,6 +3845,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	struct kvm_mmu *context = &vcpu->arch.mmu;
 
 	context->base_role.word = 0;
+	context->base_role.smm = is_smm(vcpu);
 	context->page_fault = tdp_page_fault;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
@@ -3918,6 +3886,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
 {
 	bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+	bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
 	struct kvm_mmu *context = &vcpu->arch.mmu;
 
 	MMU_WARN_ON(VALID_PAGE(context->root_hpa));
@@ -3936,6 +3905,9 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
 	context->base_role.cr0_wp  = is_write_protection(vcpu);
 	context->base_role.smep_andnot_wp
 		= smep && !is_write_protection(vcpu);
+	context->base_role.smap_andnot_wp
+		= smap && !is_write_protection(vcpu);
+	context->base_role.smm = is_smm(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
@@ -4107,7 +4079,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
 		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
 		*gpa &= ~(gpa_t)7;
 		*bytes = 8;
-		r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8);
+		r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8);
 		if (r)
 			gentry = 0;
 		new = (const u8 *)&gentry;
@@ -4207,12 +4179,19 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes)
 {
 	gfn_t gfn = gpa >> PAGE_SHIFT;
-	union kvm_mmu_page_role mask = { .word = 0 };
 	struct kvm_mmu_page *sp;
 	LIST_HEAD(invalid_list);
 	u64 entry, gentry, *spte;
 	int npte;
 	bool remote_flush, local_flush, zap_page;
+	union kvm_mmu_page_role mask = { };
+
+	mask.cr0_wp = 1;
+	mask.cr4_pae = 1;
+	mask.nxe = 1;
+	mask.smep_andnot_wp = 1;
+	mask.smap_andnot_wp = 1;
+	mask.smm = 1;
 
 	/*
 	 * If we don't have indirect shadow pages, it means no page is
@@ -4238,7 +4217,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	++vcpu->kvm->stat.mmu_pte_write;
 	kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 
-	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
 		if (detect_write_misaligned(sp, gpa, bytes) ||
 		      detect_write_flooding(sp)) {
@@ -4412,36 +4390,115 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
 	init_kvm_mmu(vcpu);
 }
 
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
-				      struct kvm_memory_slot *memslot)
+/* The return value indicates if tlb flush on all vcpus is needed. */
+typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap);
+
+/* The caller should hold mmu-lock before calling this function. */
+static bool
+slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			slot_level_handler fn, int start_level, int end_level,
+			gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
 {
-	gfn_t last_gfn;
-	int i;
+	struct slot_rmap_walk_iterator iterator;
 	bool flush = false;
 
-	last_gfn = memslot->base_gfn + memslot->npages - 1;
+	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
+			end_gfn, &iterator) {
+		if (iterator.rmap)
+			flush |= fn(kvm, iterator.rmap);
 
-	spin_lock(&kvm->mmu_lock);
+		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+			if (flush && lock_flush_tlb) {
+				kvm_flush_remote_tlbs(kvm);
+				flush = false;
+			}
+			cond_resched_lock(&kvm->mmu_lock);
+		}
+	}
 
-	for (i = PT_PAGE_TABLE_LEVEL;
-	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		unsigned long *rmapp;
-		unsigned long last_index, index;
+	if (flush && lock_flush_tlb) {
+		kvm_flush_remote_tlbs(kvm);
+		flush = false;
+	}
 
-		rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
-		last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+	return flush;
+}
 
-		for (index = 0; index <= last_index; ++index, ++rmapp) {
-			if (*rmapp)
-				flush |= __rmap_write_protect(kvm, rmapp,
-						false);
+static bool
+slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		  slot_level_handler fn, int start_level, int end_level,
+		  bool lock_flush_tlb)
+{
+	return slot_handle_level_range(kvm, memslot, fn, start_level,
+			end_level, memslot->base_gfn,
+			memslot->base_gfn + memslot->npages - 1,
+			lock_flush_tlb);
+}
+
+static bool
+slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		      slot_level_handler fn, bool lock_flush_tlb)
+{
+	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static bool
+slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			slot_level_handler fn, bool lock_flush_tlb)
+{
+	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
+				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static bool
+slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		 slot_level_handler fn, bool lock_flush_tlb)
+{
+	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+				 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+}
 
-			if (need_resched() || spin_needbreak(&kvm->mmu_lock))
-				cond_resched_lock(&kvm->mmu_lock);
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int i;
+
+	spin_lock(&kvm->mmu_lock);
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		slots = __kvm_memslots(kvm, i);
+		kvm_for_each_memslot(memslot, slots) {
+			gfn_t start, end;
+
+			start = max(gfn_start, memslot->base_gfn);
+			end = min(gfn_end, memslot->base_gfn + memslot->npages);
+			if (start >= end)
+				continue;
+
+			slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+						PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
+						start, end - 1, true);
 		}
 	}
 
 	spin_unlock(&kvm->mmu_lock);
+}
+
+static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp)
+{
+	return __rmap_write_protect(kvm, rmapp, false);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+				      struct kvm_memory_slot *memslot)
+{
+	bool flush;
+
+	spin_lock(&kvm->mmu_lock);
+	flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
+				      false);
+	spin_unlock(&kvm->mmu_lock);
 
 	/*
 	 * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
@@ -4474,9 +4531,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 	pfn_t pfn;
 	struct kvm_mmu_page *sp;
 
-	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
-		BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+restart:
+	for_each_rmap_spte(rmapp, &iter, sptep) {
 		sp = page_header(__pa(sptep));
 		pfn = spte_to_pfn(*sptep);
 
@@ -4491,71 +4547,31 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 			!kvm_is_reserved_pfn(pfn) &&
 			PageTransCompound(pfn_to_page(pfn))) {
 			drop_spte(kvm, sptep);
-			sptep = rmap_get_first(*rmapp, &iter);
 			need_tlb_flush = 1;
-		} else
-			sptep = rmap_get_next(&iter);
+			goto restart;
+		}
 	}
 
 	return need_tlb_flush;
 }
 
 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
-			struct kvm_memory_slot *memslot)
+				   const struct kvm_memory_slot *memslot)
 {
-	bool flush = false;
-	unsigned long *rmapp;
-	unsigned long last_index, index;
-
+	/* FIXME: const-ify all uses of struct kvm_memory_slot.  */
 	spin_lock(&kvm->mmu_lock);
-
-	rmapp = memslot->arch.rmap[0];
-	last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1,
-				memslot->base_gfn, PT_PAGE_TABLE_LEVEL);
-
-	for (index = 0; index <= last_index; ++index, ++rmapp) {
-		if (*rmapp)
-			flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp);
-
-		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
-			if (flush) {
-				kvm_flush_remote_tlbs(kvm);
-				flush = false;
-			}
-			cond_resched_lock(&kvm->mmu_lock);
-		}
-	}
-
-	if (flush)
-		kvm_flush_remote_tlbs(kvm);
-
+	slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
+			 kvm_mmu_zap_collapsible_spte, true);
 	spin_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot)
 {
-	gfn_t last_gfn;
-	unsigned long *rmapp;
-	unsigned long last_index, index;
-	bool flush = false;
-
-	last_gfn = memslot->base_gfn + memslot->npages - 1;
+	bool flush;
 
 	spin_lock(&kvm->mmu_lock);
-
-	rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1];
-	last_index = gfn_to_index(last_gfn, memslot->base_gfn,
-			PT_PAGE_TABLE_LEVEL);
-
-	for (index = 0; index <= last_index; ++index, ++rmapp) {
-		if (*rmapp)
-			flush |= __rmap_clear_dirty(kvm, rmapp);
-
-		if (need_resched() || spin_needbreak(&kvm->mmu_lock))
-			cond_resched_lock(&kvm->mmu_lock);
-	}
-
+	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
 	spin_unlock(&kvm->mmu_lock);
 
 	lockdep_assert_held(&kvm->slots_lock);
@@ -4574,31 +4590,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
 					struct kvm_memory_slot *memslot)
 {
-	gfn_t last_gfn;
-	int i;
-	bool flush = false;
-
-	last_gfn = memslot->base_gfn + memslot->npages - 1;
+	bool flush;
 
 	spin_lock(&kvm->mmu_lock);
-
-	for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */
-	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		unsigned long *rmapp;
-		unsigned long last_index, index;
-
-		rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
-		last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
-
-		for (index = 0; index <= last_index; ++index, ++rmapp) {
-			if (*rmapp)
-				flush |= __rmap_write_protect(kvm, rmapp,
-						false);
-
-			if (need_resched() || spin_needbreak(&kvm->mmu_lock))
-				cond_resched_lock(&kvm->mmu_lock);
-		}
-	}
+	flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
+					false);
 	spin_unlock(&kvm->mmu_lock);
 
 	/* see kvm_mmu_slot_remove_write_access */
@@ -4612,31 +4608,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
 void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 			    struct kvm_memory_slot *memslot)
 {
-	gfn_t last_gfn;
-	int i;
-	bool flush = false;
-
-	last_gfn = memslot->base_gfn + memslot->npages - 1;
+	bool flush;
 
 	spin_lock(&kvm->mmu_lock);
-
-	for (i = PT_PAGE_TABLE_LEVEL;
-	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		unsigned long *rmapp;
-		unsigned long last_index, index;
-
-		rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
-		last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
-
-		for (index = 0; index <= last_index; ++index, ++rmapp) {
-			if (*rmapp)
-				flush |= __rmap_set_dirty(kvm, rmapp);
-
-			if (need_resched() || spin_needbreak(&kvm->mmu_lock))
-				cond_resched_lock(&kvm->mmu_lock);
-		}
-	}
-
+	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
 	spin_unlock(&kvm->mmu_lock);
 
 	lockdep_assert_held(&kvm->slots_lock);
@@ -4733,13 +4708,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
 	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
 }
 
-void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
 {
 	/*
 	 * The very rare case: if the generation-number is round,
 	 * zap all shadow pages.
 	 */
-	if (unlikely(kvm_current_mmio_generation(kvm) == 0)) {
+	if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
 		printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n");
 		kvm_mmu_invalidate_zap_all_pages(kvm);
 	}
@@ -4861,15 +4836,18 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 	unsigned int  nr_pages = 0;
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
+	int i;
 
-	slots = kvm_memslots(kvm);
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		slots = __kvm_memslots(kvm, i);
 
-	kvm_for_each_memslot(memslot, slots)
-		nr_pages += memslot->npages;
+		kvm_for_each_memslot(memslot, slots)
+			nr_pages += memslot->npages;
+	}
 
 	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
 	nr_mmu_pages = max(nr_mmu_pages,
-			(unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
+			   (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
 
 	return nr_mmu_pages;
 }
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c7d65637c851..398d21c0f6dd 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -43,6 +43,7 @@
 #define PT_PDPE_LEVEL 3
 #define PT_DIRECTORY_LEVEL 2
 #define PT_PAGE_TABLE_LEVEL 1
+#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1)
 
 static inline u64 rsvd_bits(int s, int e)
 {
@@ -71,8 +72,6 @@ enum {
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
-void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
-		bool ept);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
@@ -166,8 +165,11 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	int index = (pfec >> 1) +
 		    (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
 
+	WARN_ON(pfec & PFERR_RSVD_MASK);
+
 	return (mmu->permissions[index] >> pte_access) & 1;
 }
 
 void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
 #endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 9ade5cfb5a4c..a4f62e6f2db2 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -114,7 +114,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 		return;
 
 	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
-	pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+	pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn);
 
 	if (is_error_pfn(pfn))
 		return;
@@ -131,12 +131,16 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
 	unsigned long *rmapp;
 	struct kvm_mmu_page *rev_sp;
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *slot;
 	gfn_t gfn;
 
 	rev_sp = page_header(__pa(sptep));
 	gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
 
-	if (!gfn_to_memslot(kvm, gfn)) {
+	slots = kvm_memslots_for_spte_role(kvm, rev_sp->role);
+	slot = __gfn_to_memslot(slots, gfn);
+	if (!slot) {
 		if (!__ratelimit(&ratelimit_state))
 			return;
 		audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
@@ -146,7 +150,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 		return;
 	}
 
-	rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+	rmapp = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
 	if (!*rmapp) {
 		if (!__ratelimit(&ratelimit_state))
 			return;
@@ -191,19 +195,21 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 	unsigned long *rmapp;
 	u64 *sptep;
 	struct rmap_iterator iter;
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *slot;
 
 	if (sp->role.direct || sp->unsync || sp->role.invalid)
 		return;
 
-	rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL);
+	slots = kvm_memslots_for_spte_role(kvm, sp->role);
+	slot = __gfn_to_memslot(slots, sp->gfn);
+	rmapp = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot);
 
-	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
-	     sptep = rmap_get_next(&iter)) {
+	for_each_rmap_spte(rmapp, &iter, sptep)
 		if (is_writable_pte(*sptep))
 			audit_printk(kvm, "shadow page has writable "
 				     "mappings: gfn %llx role %x\n",
 				     sp->gfn, sp->role.word);
-	}
 }
 
 static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
new file mode 100644
index 000000000000..de1d2d8062e2
--- /dev/null
+++ b/arch/x86/kvm/mtrr.c
@@ -0,0 +1,699 @@
+/*
+ * vMTRR implementation
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Authors:
+ *   Yaniv Kamay  <yaniv@qumranet.com>
+ *   Avi Kivity   <avi@qumranet.com>
+ *   Marcelo Tosatti <mtosatti@redhat.com>
+ *   Paolo Bonzini <pbonzini@redhat.com>
+ *   Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/mtrr.h>
+
+#include "cpuid.h"
+#include "mmu.h"
+
+#define IA32_MTRR_DEF_TYPE_E		(1ULL << 11)
+#define IA32_MTRR_DEF_TYPE_FE		(1ULL << 10)
+#define IA32_MTRR_DEF_TYPE_TYPE_MASK	(0xff)
+
+static bool msr_mtrr_valid(unsigned msr)
+{
+	switch (msr) {
+	case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
+	case MSR_MTRRfix64K_00000:
+	case MSR_MTRRfix16K_80000:
+	case MSR_MTRRfix16K_A0000:
+	case MSR_MTRRfix4K_C0000:
+	case MSR_MTRRfix4K_C8000:
+	case MSR_MTRRfix4K_D0000:
+	case MSR_MTRRfix4K_D8000:
+	case MSR_MTRRfix4K_E0000:
+	case MSR_MTRRfix4K_E8000:
+	case MSR_MTRRfix4K_F0000:
+	case MSR_MTRRfix4K_F8000:
+	case MSR_MTRRdefType:
+	case MSR_IA32_CR_PAT:
+		return true;
+	case 0x2f8:
+		return true;
+	}
+	return false;
+}
+
+static bool valid_pat_type(unsigned t)
+{
+	return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
+}
+
+static bool valid_mtrr_type(unsigned t)
+{
+	return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+}
+
+bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+	int i;
+	u64 mask;
+
+	if (!msr_mtrr_valid(msr))
+		return false;
+
+	if (msr == MSR_IA32_CR_PAT) {
+		for (i = 0; i < 8; i++)
+			if (!valid_pat_type((data >> (i * 8)) & 0xff))
+				return false;
+		return true;
+	} else if (msr == MSR_MTRRdefType) {
+		if (data & ~0xcff)
+			return false;
+		return valid_mtrr_type(data & 0xff);
+	} else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+		for (i = 0; i < 8 ; i++)
+			if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
+				return false;
+		return true;
+	}
+
+	/* variable MTRRs */
+	WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
+
+	mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
+	if ((msr & 1) == 0) {
+		/* MTRR base */
+		if (!valid_mtrr_type(data & 0xff))
+			return false;
+		mask |= 0xf00;
+	} else
+		/* MTRR mask */
+		mask |= 0x7ff;
+	if (data & mask) {
+		kvm_inject_gp(vcpu, 0);
+		return false;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
+
+static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
+{
+	return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E);
+}
+
+static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
+{
+	return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE);
+}
+
+static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
+{
+	return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
+}
+
+/*
+* Three terms are used in the following code:
+* - segment, it indicates the address segments covered by fixed MTRRs.
+* - unit, it corresponds to the MSR entry in the segment.
+* - range, a range is covered in one memory cache type.
+*/
+struct fixed_mtrr_segment {
+	u64 start;
+	u64 end;
+
+	int range_shift;
+
+	/* the start position in kvm_mtrr.fixed_ranges[]. */
+	int range_start;
+};
+
+static struct fixed_mtrr_segment fixed_seg_table[] = {
+	/* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */
+	{
+		.start = 0x0,
+		.end = 0x80000,
+		.range_shift = 16, /* 64K */
+		.range_start = 0,
+	},
+
+	/*
+	 * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units,
+	 * 16K fixed mtrr.
+	 */
+	{
+		.start = 0x80000,
+		.end = 0xc0000,
+		.range_shift = 14, /* 16K */
+		.range_start = 8,
+	},
+
+	/*
+	 * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units,
+	 * 4K fixed mtrr.
+	 */
+	{
+		.start = 0xc0000,
+		.end = 0x100000,
+		.range_shift = 12, /* 12K */
+		.range_start = 24,
+	}
+};
+
+/*
+ * The size of unit is covered in one MSR, one MSR entry contains
+ * 8 ranges so that unit size is always 8 * 2^range_shift.
+ */
+static u64 fixed_mtrr_seg_unit_size(int seg)
+{
+	return 8 << fixed_seg_table[seg].range_shift;
+}
+
+static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
+{
+	switch (msr) {
+	case MSR_MTRRfix64K_00000:
+		*seg = 0;
+		*unit = 0;
+		break;
+	case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
+		*seg = 1;
+		*unit = msr - MSR_MTRRfix16K_80000;
+		break;
+	case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
+		*seg = 2;
+		*unit = msr - MSR_MTRRfix4K_C0000;
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end)
+{
+	struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+	u64 unit_size = fixed_mtrr_seg_unit_size(seg);
+
+	*start = mtrr_seg->start + unit * unit_size;
+	*end = *start + unit_size;
+	WARN_ON(*end > mtrr_seg->end);
+}
+
+static int fixed_mtrr_seg_unit_range_index(int seg, int unit)
+{
+	struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+
+	WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg)
+		> mtrr_seg->end);
+
+	/* each unit has 8 ranges. */
+	return mtrr_seg->range_start + 8 * unit;
+}
+
+static int fixed_mtrr_seg_end_range_index(int seg)
+{
+	struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+	int n;
+
+	n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift;
+	return mtrr_seg->range_start + n - 1;
+}
+
+static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end)
+{
+	int seg, unit;
+
+	if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
+		return false;
+
+	fixed_mtrr_seg_unit_range(seg, unit, start, end);
+	return true;
+}
+
+static int fixed_msr_to_range_index(u32 msr)
+{
+	int seg, unit;
+
+	if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
+		return -1;
+
+	return fixed_mtrr_seg_unit_range_index(seg, unit);
+}
+
+static int fixed_mtrr_addr_to_seg(u64 addr)
+{
+	struct fixed_mtrr_segment *mtrr_seg;
+	int seg, seg_num = ARRAY_SIZE(fixed_seg_table);
+
+	for (seg = 0; seg < seg_num; seg++) {
+		mtrr_seg = &fixed_seg_table[seg];
+		if (mtrr_seg->start >= addr && addr < mtrr_seg->end)
+			return seg;
+	}
+
+	return -1;
+}
+
+static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg)
+{
+	struct fixed_mtrr_segment *mtrr_seg;
+	int index;
+
+	mtrr_seg = &fixed_seg_table[seg];
+	index = mtrr_seg->range_start;
+	index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift;
+	return index;
+}
+
+static u64 fixed_mtrr_range_end_addr(int seg, int index)
+{
+	struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+	int pos = index - mtrr_seg->range_start;
+
+	return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift);
+}
+
+static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
+{
+	u64 mask;
+
+	*start = range->base & PAGE_MASK;
+
+	mask = range->mask & PAGE_MASK;
+	mask |= ~0ULL << boot_cpu_data.x86_phys_bits;
+
+	/* This cannot overflow because writing to the reserved bits of
+	 * variable MTRRs causes a #GP.
+	 */
+	*end = (*start | ~mask) + 1;
+}
+
+static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
+{
+	struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+	gfn_t start, end;
+	int index;
+
+	if (msr == MSR_IA32_CR_PAT || !tdp_enabled ||
+	      !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+		return;
+
+	if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
+		return;
+
+	/* fixed MTRRs. */
+	if (fixed_msr_to_range(msr, &start, &end)) {
+		if (!fixed_mtrr_is_enabled(mtrr_state))
+			return;
+	} else if (msr == MSR_MTRRdefType) {
+		start = 0x0;
+		end = ~0ULL;
+	} else {
+		/* variable range MTRRs. */
+		index = (msr - 0x200) / 2;
+		var_mtrr_range(&mtrr_state->var_ranges[index], &start, &end);
+	}
+
+	kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
+}
+
+static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range)
+{
+	return (range->mask & (1 << 11)) != 0;
+}
+
+static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+	struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+	struct kvm_mtrr_range *tmp, *cur;
+	int index, is_mtrr_mask;
+
+	index = (msr - 0x200) / 2;
+	is_mtrr_mask = msr - 0x200 - 2 * index;
+	cur = &mtrr_state->var_ranges[index];
+
+	/* remove the entry if it's in the list. */
+	if (var_mtrr_range_is_valid(cur))
+		list_del(&mtrr_state->var_ranges[index].node);
+
+	if (!is_mtrr_mask)
+		cur->base = data;
+	else
+		cur->mask = data;
+
+	/* add it to the list if it's enabled. */
+	if (var_mtrr_range_is_valid(cur)) {
+		list_for_each_entry(tmp, &mtrr_state->head, node)
+			if (cur->base >= tmp->base)
+				break;
+		list_add_tail(&cur->node, &tmp->node);
+	}
+}
+
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+	int index;
+
+	if (!kvm_mtrr_valid(vcpu, msr, data))
+		return 1;
+
+	index = fixed_msr_to_range_index(msr);
+	if (index >= 0)
+		*(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data;
+	else if (msr == MSR_MTRRdefType)
+		vcpu->arch.mtrr_state.deftype = data;
+	else if (msr == MSR_IA32_CR_PAT)
+		vcpu->arch.pat = data;
+	else
+		set_var_mtrr_msr(vcpu, msr, data);
+
+	update_mtrr(vcpu, msr);
+	return 0;
+}
+
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+	int index;
+
+	/* MSR_MTRRcap is a readonly MSR. */
+	if (msr == MSR_MTRRcap) {
+		/*
+		 * SMRR = 0
+		 * WC = 1
+		 * FIX = 1
+		 * VCNT = KVM_NR_VAR_MTRR
+		 */
+		*pdata = 0x500 | KVM_NR_VAR_MTRR;
+		return 0;
+	}
+
+	if (!msr_mtrr_valid(msr))
+		return 1;
+
+	index = fixed_msr_to_range_index(msr);
+	if (index >= 0)
+		*pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index];
+	else if (msr == MSR_MTRRdefType)
+		*pdata = vcpu->arch.mtrr_state.deftype;
+	else if (msr == MSR_IA32_CR_PAT)
+		*pdata = vcpu->arch.pat;
+	else {	/* Variable MTRRs */
+		int is_mtrr_mask;
+
+		index = (msr - 0x200) / 2;
+		is_mtrr_mask = msr - 0x200 - 2 * index;
+		if (!is_mtrr_mask)
+			*pdata = vcpu->arch.mtrr_state.var_ranges[index].base;
+		else
+			*pdata = vcpu->arch.mtrr_state.var_ranges[index].mask;
+	}
+
+	return 0;
+}
+
+void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu)
+{
+	INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head);
+}
+
+struct mtrr_iter {
+	/* input fields. */
+	struct kvm_mtrr *mtrr_state;
+	u64 start;
+	u64 end;
+
+	/* output fields. */
+	int mem_type;
+	/* [start, end) is not fully covered in MTRRs? */
+	bool partial_map;
+
+	/* private fields. */
+	union {
+		/* used for fixed MTRRs. */
+		struct {
+			int index;
+			int seg;
+		};
+
+		/* used for var MTRRs. */
+		struct {
+			struct kvm_mtrr_range *range;
+			/* max address has been covered in var MTRRs. */
+			u64 start_max;
+		};
+	};
+
+	bool fixed;
+};
+
+static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter)
+{
+	int seg, index;
+
+	if (!fixed_mtrr_is_enabled(iter->mtrr_state))
+		return false;
+
+	seg = fixed_mtrr_addr_to_seg(iter->start);
+	if (seg < 0)
+		return false;
+
+	iter->fixed = true;
+	index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg);
+	iter->index = index;
+	iter->seg = seg;
+	return true;
+}
+
+static bool match_var_range(struct mtrr_iter *iter,
+			    struct kvm_mtrr_range *range)
+{
+	u64 start, end;
+
+	var_mtrr_range(range, &start, &end);
+	if (!(start >= iter->end || end <= iter->start)) {
+		iter->range = range;
+
+		/*
+		 * the function is called when we do kvm_mtrr.head walking.
+		 * Range has the minimum base address which interleaves
+		 * [looker->start_max, looker->end).
+		 */
+		iter->partial_map |= iter->start_max < start;
+
+		/* update the max address has been covered. */
+		iter->start_max = max(iter->start_max, end);
+		return true;
+	}
+
+	return false;
+}
+
+static void __mtrr_lookup_var_next(struct mtrr_iter *iter)
+{
+	struct kvm_mtrr *mtrr_state = iter->mtrr_state;
+
+	list_for_each_entry_continue(iter->range, &mtrr_state->head, node)
+		if (match_var_range(iter, iter->range))
+			return;
+
+	iter->range = NULL;
+	iter->partial_map |= iter->start_max < iter->end;
+}
+
+static void mtrr_lookup_var_start(struct mtrr_iter *iter)
+{
+	struct kvm_mtrr *mtrr_state = iter->mtrr_state;
+
+	iter->fixed = false;
+	iter->start_max = iter->start;
+	iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node);
+
+	__mtrr_lookup_var_next(iter);
+}
+
+static void mtrr_lookup_fixed_next(struct mtrr_iter *iter)
+{
+	/* terminate the lookup. */
+	if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) {
+		iter->fixed = false;
+		iter->range = NULL;
+		return;
+	}
+
+	iter->index++;
+
+	/* have looked up for all fixed MTRRs. */
+	if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges))
+		return mtrr_lookup_var_start(iter);
+
+	/* switch to next segment. */
+	if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg))
+		iter->seg++;
+}
+
+static void mtrr_lookup_var_next(struct mtrr_iter *iter)
+{
+	__mtrr_lookup_var_next(iter);
+}
+
+static void mtrr_lookup_start(struct mtrr_iter *iter)
+{
+	if (!mtrr_is_enabled(iter->mtrr_state)) {
+		iter->partial_map = true;
+		return;
+	}
+
+	if (!mtrr_lookup_fixed_start(iter))
+		mtrr_lookup_var_start(iter);
+}
+
+static void mtrr_lookup_init(struct mtrr_iter *iter,
+			     struct kvm_mtrr *mtrr_state, u64 start, u64 end)
+{
+	iter->mtrr_state = mtrr_state;
+	iter->start = start;
+	iter->end = end;
+	iter->partial_map = false;
+	iter->fixed = false;
+	iter->range = NULL;
+
+	mtrr_lookup_start(iter);
+}
+
+static bool mtrr_lookup_okay(struct mtrr_iter *iter)
+{
+	if (iter->fixed) {
+		iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index];
+		return true;
+	}
+
+	if (iter->range) {
+		iter->mem_type = iter->range->base & 0xff;
+		return true;
+	}
+
+	return false;
+}
+
+static void mtrr_lookup_next(struct mtrr_iter *iter)
+{
+	if (iter->fixed)
+		mtrr_lookup_fixed_next(iter);
+	else
+		mtrr_lookup_var_next(iter);
+}
+
+#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \
+	for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \
+	     mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_))
+
+u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+	struct mtrr_iter iter;
+	u64 start, end;
+	int type = -1;
+	const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
+			       | (1 << MTRR_TYPE_WRTHROUGH);
+
+	start = gfn_to_gpa(gfn);
+	end = start + PAGE_SIZE;
+
+	mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
+		int curr_type = iter.mem_type;
+
+		/*
+		 * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR
+		 * Precedences.
+		 */
+
+		if (type == -1) {
+			type = curr_type;
+			continue;
+		}
+
+		/*
+		 * If two or more variable memory ranges match and the
+		 * memory types are identical, then that memory type is
+		 * used.
+		 */
+		if (type == curr_type)
+			continue;
+
+		/*
+		 * If two or more variable memory ranges match and one of
+		 * the memory types is UC, the UC memory type used.
+		 */
+		if (curr_type == MTRR_TYPE_UNCACHABLE)
+			return MTRR_TYPE_UNCACHABLE;
+
+		/*
+		 * If two or more variable memory ranges match and the
+		 * memory types are WT and WB, the WT memory type is used.
+		 */
+		if (((1 << type) & wt_wb_mask) &&
+		      ((1 << curr_type) & wt_wb_mask)) {
+			type = MTRR_TYPE_WRTHROUGH;
+			continue;
+		}
+
+		/*
+		 * For overlaps not defined by the above rules, processor
+		 * behavior is undefined.
+		 */
+
+		/* We use WB for this undefined behavior. :( */
+		return MTRR_TYPE_WRBACK;
+	}
+
+	/* It is not covered by MTRRs. */
+	if (iter.partial_map) {
+		/*
+		 * We just check one page, partially covered by MTRRs is
+		 * impossible.
+		 */
+		WARN_ON(type != -1);
+		type = mtrr_default_type(mtrr_state);
+	}
+	return type;
+}
+EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
+
+bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
+					  int page_num)
+{
+	struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+	struct mtrr_iter iter;
+	u64 start, end;
+	int type = -1;
+
+	start = gfn_to_gpa(gfn);
+	end = gfn_to_gpa(gfn + page_num);
+	mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
+		if (type == -1) {
+			type = iter.mem_type;
+			continue;
+		}
+
+		if (type != iter.mem_type)
+			return false;
+	}
+
+	if (!iter.partial_map)
+		return true;
+
+	if (type == -1)
+		return true;
+
+	return type == mtrr_default_type(mtrr_state);
+}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index fd49c867b25a..0f67d7e24800 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -256,7 +256,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
 		if (ret)
 			return ret;
 
-		mark_page_dirty(vcpu->kvm, table_gfn);
+		kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
 		walker->ptes[level] = pte;
 	}
 	return 0;
@@ -338,7 +338,7 @@ retry_walk:
 
 		real_gfn = gpa_to_gfn(real_gfn);
 
-		host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn,
+		host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn,
 					    &walker->pte_writable[walker->level - 1]);
 		if (unlikely(kvm_is_error_hva(host_addr)))
 			goto error;
@@ -511,11 +511,11 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
 		base_gpa = pte_gpa & ~mask;
 		index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
 
-		r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
+		r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
 				gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
 		curr_pte = gw->prefetch_ptes[index];
 	} else
-		r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
+		r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
 				  &curr_pte, sizeof(curr_pte));
 
 	return r || curr_pte != gw->ptes[level - 1];
@@ -718,6 +718,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 					      mmu_is_nested(vcpu));
 		if (likely(r != RET_MMIO_PF_INVALID))
 			return r;
+
+		/*
+		 * page fault with PFEC.RSVD  = 1 is caused by shadow
+		 * page fault, should not be used to walk guest page
+		 * table.
+		 */
+		error_code &= ~PFERR_RSVD_MASK;
 	};
 
 	r = mmu_topup_memory_caches(vcpu);
@@ -862,8 +869,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 			if (!rmap_can_add(vcpu))
 				break;
 
-			if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
-						  sizeof(pt_element_t)))
+			if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+						       sizeof(pt_element_t)))
 				break;
 
 			FNAME(update_pte)(vcpu, sp, sptep, &gpte);
@@ -949,8 +956,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
 		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
 
-		if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
-					  sizeof(pt_element_t)))
+		if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+					       sizeof(pt_element_t)))
 			return -EINVAL;
 
 		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
@@ -963,7 +970,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		pte_access &= FNAME(gpte_access)(vcpu, gpte);
 		FNAME(protect_clean_gpte)(&pte_access, gpte);
 
-		if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
+		if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
 		      &nr_present))
 			continue;
 
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 29fbf9dfdc54..31aa2c85dc97 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -1,11 +1,12 @@
 /*
  * Kernel-based Virtual Machine -- Performance Monitoring Unit support
  *
- * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ * Copyright 2015 Red Hat, Inc. and/or its affiliates.
  *
  * Authors:
  *   Avi Kivity   <avi@redhat.com>
  *   Gleb Natapov <gleb@redhat.com>
+ *   Wei Huang    <wei@redhat.com>
  *
  * This work is licensed under the terms of the GNU GPL, version 2.  See
  * the COPYING file in the top-level directory.
@@ -19,88 +20,39 @@
 #include "x86.h"
 #include "cpuid.h"
 #include "lapic.h"
+#include "pmu.h"
+
+/* NOTE:
+ * - Each perf counter is defined as "struct kvm_pmc";
+ * - There are two types of perf counters: general purpose (gp) and fixed.
+ *   gp counters are stored in gp_counters[] and fixed counters are stored
+ *   in fixed_counters[] respectively. Both of them are part of "struct
+ *   kvm_pmu";
+ * - pmu.c understands the difference between gp counters and fixed counters.
+ *   However AMD doesn't support fixed-counters;
+ * - There are three types of index to access perf counters (PMC):
+ *     1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
+ *        has MSR_K7_PERFCTRn.
+ *     2. MSR Index (named idx): This normally is used by RDPMC instruction.
+ *        For instance AMD RDPMC instruction uses 0000_0003h in ECX to access
+ *        C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except
+ *        that it also supports fixed counters. idx can be used to as index to
+ *        gp and fixed counters.
+ *     3. Global PMC Index (named pmc): pmc is an index specific to PMU
+ *        code. Each pmc, stored in kvm_pmc.idx field, is unique across
+ *        all perf counters (both gp and fixed). The mapping relationship
+ *        between pmc and perf counters is as the following:
+ *        * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
+ *                 [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
+ *        * AMD:   [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
+ */
 
-static struct kvm_arch_event_perf_mapping {
-	u8 eventsel;
-	u8 unit_mask;
-	unsigned event_type;
-	bool inexact;
-} arch_events[] = {
-	/* Index must match CPUID 0x0A.EBX bit vector */
-	[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
-	[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
-	[2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES  },
-	[3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
-	[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
-	[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
-	[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
-	[7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
-};
-
-/* mapping between fixed pmc index and arch_events array */
-static int fixed_pmc_events[] = {1, 0, 7};
-
-static bool pmc_is_gp(struct kvm_pmc *pmc)
-{
-	return pmc->type == KVM_PMC_GP;
-}
-
-static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
-{
-	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
-
-	return pmu->counter_bitmask[pmc->type];
-}
-
-static inline bool pmc_enabled(struct kvm_pmc *pmc)
-{
-	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
-	return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
-}
-
-static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
-					 u32 base)
-{
-	if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
-		return &pmu->gp_counters[msr - base];
-	return NULL;
-}
-
-static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
-{
-	int base = MSR_CORE_PERF_FIXED_CTR0;
-	if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
-		return &pmu->fixed_counters[msr - base];
-	return NULL;
-}
-
-static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
-{
-	return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx);
-}
-
-static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
-{
-	if (idx < INTEL_PMC_IDX_FIXED)
-		return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
-	else
-		return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED);
-}
-
-void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
-{
-	if (vcpu->arch.apic)
-		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
-}
-
-static void trigger_pmi(struct irq_work *irq_work)
+static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
 {
-	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu,
-			irq_work);
-	struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu,
-			arch.pmu);
+	struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
+	struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
 
-	kvm_deliver_pmi(vcpu);
+	kvm_pmu_deliver_pmi(vcpu);
 }
 
 static void kvm_perf_overflow(struct perf_event *perf_event,
@@ -108,63 +60,46 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
 			      struct pt_regs *regs)
 {
 	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
-	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
-	if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
+	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+	if (!test_and_set_bit(pmc->idx,
+			      (unsigned long *)&pmu->reprogram_pmi)) {
 		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
 		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
 	}
 }
 
 static void kvm_perf_overflow_intr(struct perf_event *perf_event,
-		struct perf_sample_data *data, struct pt_regs *regs)
+				   struct perf_sample_data *data,
+				   struct pt_regs *regs)
 {
 	struct kvm_pmc *pmc = perf_event->overflow_handler_context;
-	struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
-	if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
+	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+	if (!test_and_set_bit(pmc->idx,
+			      (unsigned long *)&pmu->reprogram_pmi)) {
 		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
 		kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+
 		/*
 		 * Inject PMI. If vcpu was in a guest mode during NMI PMI
 		 * can be ejected on a guest mode re-entry. Otherwise we can't
 		 * be sure that vcpu wasn't executing hlt instruction at the
-		 * time of vmexit and is not going to re-enter guest mode until,
+		 * time of vmexit and is not going to re-enter guest mode until
 		 * woken up. So we should wake it, but this is impossible from
 		 * NMI context. Do it from irq work instead.
 		 */
 		if (!kvm_is_in_guest())
-			irq_work_queue(&pmc->vcpu->arch.pmu.irq_work);
+			irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
 		else
 			kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
 	}
 }
 
-static u64 read_pmc(struct kvm_pmc *pmc)
-{
-	u64 counter, enabled, running;
-
-	counter = pmc->counter;
-
-	if (pmc->perf_event)
-		counter += perf_event_read_value(pmc->perf_event,
-						 &enabled, &running);
-
-	/* FIXME: Scaling needed? */
-
-	return counter & pmc_bitmask(pmc);
-}
-
-static void stop_counter(struct kvm_pmc *pmc)
-{
-	if (pmc->perf_event) {
-		pmc->counter = read_pmc(pmc);
-		perf_event_release_kernel(pmc->perf_event);
-		pmc->perf_event = NULL;
-	}
-}
-
-static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
-		unsigned config, bool exclude_user, bool exclude_kernel,
-		bool intr, bool in_tx, bool in_tx_cp)
+static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
+				  unsigned config, bool exclude_user,
+				  bool exclude_kernel, bool intr,
+				  bool in_tx, bool in_tx_cp)
 {
 	struct perf_event *event;
 	struct perf_event_attr attr = {
@@ -177,6 +112,7 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
 		.exclude_kernel = exclude_kernel,
 		.config = config,
 	};
+
 	if (in_tx)
 		attr.config |= HSW_IN_TX;
 	if (in_tx_cp)
@@ -188,33 +124,16 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
 						 intr ? kvm_perf_overflow_intr :
 						 kvm_perf_overflow, pmc);
 	if (IS_ERR(event)) {
-		printk_once("kvm: pmu event creation failed %ld\n",
-				PTR_ERR(event));
+		printk_once("kvm_pmu: event creation failed %ld\n",
+			    PTR_ERR(event));
 		return;
 	}
 
 	pmc->perf_event = event;
-	clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi);
-}
-
-static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select,
-		u8 unit_mask)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(arch_events); i++)
-		if (arch_events[i].eventsel == event_select
-				&& arch_events[i].unit_mask == unit_mask
-				&& (pmu->available_event_types & (1 << i)))
-			break;
-
-	if (i == ARRAY_SIZE(arch_events))
-		return PERF_COUNT_HW_MAX;
-
-	return arch_events[i].event_type;
+	clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi);
 }
 
-static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
+void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 {
 	unsigned config, type = PERF_TYPE_RAW;
 	u8 event_select, unit_mask;
@@ -224,21 +143,22 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 
 	pmc->eventsel = eventsel;
 
-	stop_counter(pmc);
+	pmc_stop_counter(pmc);
 
-	if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc))
+	if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
 		return;
 
 	event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
 	unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
 
 	if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
-				ARCH_PERFMON_EVENTSEL_INV |
-				ARCH_PERFMON_EVENTSEL_CMASK |
-				HSW_IN_TX |
-				HSW_IN_TX_CHECKPOINTED))) {
-		config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
-				unit_mask);
+			  ARCH_PERFMON_EVENTSEL_INV |
+			  ARCH_PERFMON_EVENTSEL_CMASK |
+			  HSW_IN_TX |
+			  HSW_IN_TX_CHECKPOINTED))) {
+		config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc),
+						      event_select,
+						      unit_mask);
 		if (config != PERF_COUNT_HW_MAX)
 			type = PERF_TYPE_HARDWARE;
 	}
@@ -246,56 +166,36 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
 	if (type == PERF_TYPE_RAW)
 		config = eventsel & X86_RAW_EVENT_MASK;
 
-	reprogram_counter(pmc, type, config,
-			!(eventsel & ARCH_PERFMON_EVENTSEL_USR),
-			!(eventsel & ARCH_PERFMON_EVENTSEL_OS),
-			eventsel & ARCH_PERFMON_EVENTSEL_INT,
-			(eventsel & HSW_IN_TX),
-			(eventsel & HSW_IN_TX_CHECKPOINTED));
+	pmc_reprogram_counter(pmc, type, config,
+			      !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+			      !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+			      eventsel & ARCH_PERFMON_EVENTSEL_INT,
+			      (eventsel & HSW_IN_TX),
+			      (eventsel & HSW_IN_TX_CHECKPOINTED));
 }
+EXPORT_SYMBOL_GPL(reprogram_gp_counter);
 
-static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
+void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
 {
-	unsigned en = en_pmi & 0x3;
-	bool pmi = en_pmi & 0x8;
+	unsigned en_field = ctrl & 0x3;
+	bool pmi = ctrl & 0x8;
 
-	stop_counter(pmc);
+	pmc_stop_counter(pmc);
 
-	if (!en || !pmc_enabled(pmc))
+	if (!en_field || !pmc_is_enabled(pmc))
 		return;
 
-	reprogram_counter(pmc, PERF_TYPE_HARDWARE,
-			arch_events[fixed_pmc_events[idx]].event_type,
-			!(en & 0x2), /* exclude user */
-			!(en & 0x1), /* exclude kernel */
-			pmi, false, false);
+	pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
+			      kvm_x86_ops->pmu_ops->find_fixed_event(idx),
+			      !(en_field & 0x2), /* exclude user */
+			      !(en_field & 0x1), /* exclude kernel */
+			      pmi, false, false);
 }
+EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
 
-static inline u8 fixed_en_pmi(u64 ctrl, int idx)
+void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
 {
-	return (ctrl >> (idx * 4)) & 0xf;
-}
-
-static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
-{
-	int i;
-
-	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
-		u8 en_pmi = fixed_en_pmi(data, i);
-		struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i);
-
-		if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi)
-			continue;
-
-		reprogram_fixed_counter(pmc, en_pmi, i);
-	}
-
-	pmu->fixed_ctr_ctrl = data;
-}
-
-static void reprogram_idx(struct kvm_pmu *pmu, int idx)
-{
-	struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx);
+	struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);
 
 	if (!pmc)
 		return;
@@ -303,274 +203,107 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx)
 	if (pmc_is_gp(pmc))
 		reprogram_gp_counter(pmc, pmc->eventsel);
 	else {
-		int fidx = idx - INTEL_PMC_IDX_FIXED;
-		reprogram_fixed_counter(pmc,
-				fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);
+		int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+		u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);
+
+		reprogram_fixed_counter(pmc, ctrl, idx);
 	}
 }
+EXPORT_SYMBOL_GPL(reprogram_counter);
 
-static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
 {
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	u64 bitmask;
 	int bit;
-	u64 diff = pmu->global_ctrl ^ data;
-
-	pmu->global_ctrl = data;
-
-	for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
-		reprogram_idx(pmu, bit);
-}
 
-bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr)
-{
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	int ret;
-
-	switch (msr) {
-	case MSR_CORE_PERF_FIXED_CTR_CTRL:
-	case MSR_CORE_PERF_GLOBAL_STATUS:
-	case MSR_CORE_PERF_GLOBAL_CTRL:
-	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-		ret = pmu->version > 1;
-		break;
-	default:
-		ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)
-			|| get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0)
-			|| get_fixed_pmc(pmu, msr);
-		break;
-	}
-	return ret;
-}
+	bitmask = pmu->reprogram_pmi;
 
-int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
-{
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	struct kvm_pmc *pmc;
+	for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
+		struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit);
 
-	switch (index) {
-	case MSR_CORE_PERF_FIXED_CTR_CTRL:
-		*data = pmu->fixed_ctr_ctrl;
-		return 0;
-	case MSR_CORE_PERF_GLOBAL_STATUS:
-		*data = pmu->global_status;
-		return 0;
-	case MSR_CORE_PERF_GLOBAL_CTRL:
-		*data = pmu->global_ctrl;
-		return 0;
-	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-		*data = pmu->global_ovf_ctrl;
-		return 0;
-	default:
-		if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
-				(pmc = get_fixed_pmc(pmu, index))) {
-			*data = read_pmc(pmc);
-			return 0;
-		} else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
-			*data = pmc->eventsel;
-			return 0;
+		if (unlikely(!pmc || !pmc->perf_event)) {
+			clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
+			continue;
 		}
-	}
-	return 1;
-}
 
-int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
-{
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	struct kvm_pmc *pmc;
-	u32 index = msr_info->index;
-	u64 data = msr_info->data;
-
-	switch (index) {
-	case MSR_CORE_PERF_FIXED_CTR_CTRL:
-		if (pmu->fixed_ctr_ctrl == data)
-			return 0;
-		if (!(data & 0xfffffffffffff444ull)) {
-			reprogram_fixed_counters(pmu, data);
-			return 0;
-		}
-		break;
-	case MSR_CORE_PERF_GLOBAL_STATUS:
-		if (msr_info->host_initiated) {
-			pmu->global_status = data;
-			return 0;
-		}
-		break; /* RO MSR */
-	case MSR_CORE_PERF_GLOBAL_CTRL:
-		if (pmu->global_ctrl == data)
-			return 0;
-		if (!(data & pmu->global_ctrl_mask)) {
-			global_ctrl_changed(pmu, data);
-			return 0;
-		}
-		break;
-	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-		if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
-			if (!msr_info->host_initiated)
-				pmu->global_status &= ~data;
-			pmu->global_ovf_ctrl = data;
-			return 0;
-		}
-		break;
-	default:
-		if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
-				(pmc = get_fixed_pmc(pmu, index))) {
-			if (!msr_info->host_initiated)
-				data = (s64)(s32)data;
-			pmc->counter += data - read_pmc(pmc);
-			return 0;
-		} else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
-			if (data == pmc->eventsel)
-				return 0;
-			if (!(data & pmu->reserved_bits)) {
-				reprogram_gp_counter(pmc, data);
-				return 0;
-			}
-		}
+		reprogram_counter(pmu, bit);
 	}
-	return 1;
 }
 
-int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc)
+/* check if idx is a valid index to access PMU */
+int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
 {
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	bool fixed = pmc & (1u << 30);
-	pmc &= ~(3u << 30);
-	return (!fixed && pmc >= pmu->nr_arch_gp_counters) ||
-		(fixed && pmc >= pmu->nr_arch_fixed_counters);
+	return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
 }
 
-int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
 {
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	bool fast_mode = pmc & (1u << 31);
-	bool fixed = pmc & (1u << 30);
-	struct kvm_pmc *counters;
-	u64 ctr;
-
-	pmc &= ~(3u << 30);
-	if (!fixed && pmc >= pmu->nr_arch_gp_counters)
-		return 1;
-	if (fixed && pmc >= pmu->nr_arch_fixed_counters)
+	bool fast_mode = idx & (1u << 31);
+	struct kvm_pmc *pmc;
+	u64 ctr_val;
+
+	pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx);
+	if (!pmc)
 		return 1;
-	counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
-	ctr = read_pmc(&counters[pmc]);
+
+	ctr_val = pmc_read_counter(pmc);
 	if (fast_mode)
-		ctr = (u32)ctr;
-	*data = ctr;
+		ctr_val = (u32)ctr_val;
 
+	*data = ctr_val;
 	return 0;
 }
 
-void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
 {
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	struct kvm_cpuid_entry2 *entry;
-	union cpuid10_eax eax;
-	union cpuid10_edx edx;
-
-	pmu->nr_arch_gp_counters = 0;
-	pmu->nr_arch_fixed_counters = 0;
-	pmu->counter_bitmask[KVM_PMC_GP] = 0;
-	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
-	pmu->version = 0;
-	pmu->reserved_bits = 0xffffffff00200000ull;
-
-	entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
-	if (!entry)
-		return;
-	eax.full = entry->eax;
-	edx.full = entry->edx;
-
-	pmu->version = eax.split.version_id;
-	if (!pmu->version)
-		return;
-
-	pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
-					INTEL_PMC_MAX_GENERIC);
-	pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
-	pmu->available_event_types = ~entry->ebx &
-					((1ull << eax.split.mask_length) - 1);
-
-	if (pmu->version == 1) {
-		pmu->nr_arch_fixed_counters = 0;
-	} else {
-		pmu->nr_arch_fixed_counters =
-			min_t(int, edx.split.num_counters_fixed,
-				INTEL_PMC_MAX_FIXED);
-		pmu->counter_bitmask[KVM_PMC_FIXED] =
-			((u64)1 << edx.split.bit_width_fixed) - 1;
-	}
+	if (vcpu->arch.apic)
+		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+}
 
-	pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
-		(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
-	pmu->global_ctrl_mask = ~pmu->global_ctrl;
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+	return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
+}
 
-	entry = kvm_find_cpuid_entry(vcpu, 7, 0);
-	if (entry &&
-	    (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
-	    (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
-		pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+	return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data);
 }
 
-void kvm_pmu_init(struct kvm_vcpu *vcpu)
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
-	int i;
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
+	return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info);
+}
 
-	memset(pmu, 0, sizeof(*pmu));
-	for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
-		pmu->gp_counters[i].type = KVM_PMC_GP;
-		pmu->gp_counters[i].vcpu = vcpu;
-		pmu->gp_counters[i].idx = i;
-	}
-	for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
-		pmu->fixed_counters[i].type = KVM_PMC_FIXED;
-		pmu->fixed_counters[i].vcpu = vcpu;
-		pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
-	}
-	init_irq_work(&pmu->irq_work, trigger_pmi);
-	kvm_pmu_cpuid_update(vcpu);
+/* refresh PMU settings. This function generally is called when underlying
+ * settings are changed (such as changes of PMU CPUID by guest VMs), which
+ * should rarely happen.
+ */
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+	kvm_x86_ops->pmu_ops->refresh(vcpu);
 }
 
 void kvm_pmu_reset(struct kvm_vcpu *vcpu)
 {
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	int i;
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 
 	irq_work_sync(&pmu->irq_work);
-	for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
-		struct kvm_pmc *pmc = &pmu->gp_counters[i];
-		stop_counter(pmc);
-		pmc->counter = pmc->eventsel = 0;
-	}
+	kvm_x86_ops->pmu_ops->reset(vcpu);
+}
 
-	for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
-		stop_counter(&pmu->fixed_counters[i]);
+void kvm_pmu_init(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
 
-	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
-		pmu->global_ovf_ctrl = 0;
+	memset(pmu, 0, sizeof(*pmu));
+	kvm_x86_ops->pmu_ops->init(vcpu);
+	init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
+	kvm_pmu_refresh(vcpu);
 }
 
 void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
 {
 	kvm_pmu_reset(vcpu);
 }
-
-void kvm_handle_pmu_event(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pmu *pmu = &vcpu->arch.pmu;
-	u64 bitmask;
-	int bit;
-
-	bitmask = pmu->reprogram_pmi;
-
-	for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
-		struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit);
-
-		if (unlikely(!pmc || !pmc->perf_event)) {
-			clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
-			continue;
-		}
-
-		reprogram_idx(pmu, bit);
-	}
-}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
new file mode 100644
index 000000000000..f96e1f962587
--- /dev/null
+++ b/arch/x86/kvm/pmu.h
@@ -0,0 +1,118 @@
+#ifndef __KVM_X86_PMU_H
+#define __KVM_X86_PMU_H
+
+#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
+#define pmu_to_vcpu(pmu)  (container_of((pmu), struct kvm_vcpu, arch.pmu))
+#define pmc_to_pmu(pmc)   (&(pmc)->vcpu->arch.pmu)
+
+/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
+#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
+
+struct kvm_event_hw_type_mapping {
+	u8 eventsel;
+	u8 unit_mask;
+	unsigned event_type;
+};
+
+struct kvm_pmu_ops {
+	unsigned (*find_arch_event)(struct kvm_pmu *pmu, u8 event_select,
+				    u8 unit_mask);
+	unsigned (*find_fixed_event)(int idx);
+	bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
+	struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
+	struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx);
+	int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx);
+	bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
+	int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+	int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+	void (*refresh)(struct kvm_vcpu *vcpu);
+	void (*init)(struct kvm_vcpu *vcpu);
+	void (*reset)(struct kvm_vcpu *vcpu);
+};
+
+static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
+{
+	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+	return pmu->counter_bitmask[pmc->type];
+}
+
+static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
+{
+	u64 counter, enabled, running;
+
+	counter = pmc->counter;
+	if (pmc->perf_event)
+		counter += perf_event_read_value(pmc->perf_event,
+						 &enabled, &running);
+	/* FIXME: Scaling needed? */
+	return counter & pmc_bitmask(pmc);
+}
+
+static inline void pmc_stop_counter(struct kvm_pmc *pmc)
+{
+	if (pmc->perf_event) {
+		pmc->counter = pmc_read_counter(pmc);
+		perf_event_release_kernel(pmc->perf_event);
+		pmc->perf_event = NULL;
+	}
+}
+
+static inline bool pmc_is_gp(struct kvm_pmc *pmc)
+{
+	return pmc->type == KVM_PMC_GP;
+}
+
+static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
+{
+	return pmc->type == KVM_PMC_FIXED;
+}
+
+static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
+{
+	return kvm_x86_ops->pmu_ops->pmc_is_enabled(pmc);
+}
+
+/* returns general purpose PMC with the specified MSR. Note that it can be
+ * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a
+ * paramenter to tell them apart.
+ */
+static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
+					 u32 base)
+{
+	if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
+		return &pmu->gp_counters[msr - base];
+
+	return NULL;
+}
+
+/* returns fixed PMC with the specified MSR */
+static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
+{
+	int base = MSR_CORE_PERF_FIXED_CTR0;
+
+	if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
+		return &pmu->fixed_counters[msr - base];
+
+	return NULL;
+}
+
+void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
+void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
+void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
+
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
+int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx);
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
+void kvm_pmu_reset(struct kvm_vcpu *vcpu);
+void kvm_pmu_init(struct kvm_vcpu *vcpu);
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
+
+extern struct kvm_pmu_ops intel_pmu_ops;
+extern struct kvm_pmu_ops amd_pmu_ops;
+#endif /* __KVM_X86_PMU_H */
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
new file mode 100644
index 000000000000..886aa25a7131
--- /dev/null
+++ b/arch/x86/kvm/pmu_amd.c
@@ -0,0 +1,207 @@
+/*
+ * KVM PMU support for AMD
+ *
+ * Copyright 2015, Red Hat, Inc. and/or its affiliates.
+ *
+ * Author:
+ *   Wei Huang <wei@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Implementation is based on pmu_intel.c file
+ */
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+
+/* duplicated from amd_perfmon_event_map, K7 and above should work. */
+static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
+	[0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+	[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+	[2] = { 0x80, 0x00, PERF_COUNT_HW_CACHE_REFERENCES },
+	[3] = { 0x81, 0x00, PERF_COUNT_HW_CACHE_MISSES },
+	[4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+	[5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+	[6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
+	[7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
+};
+
+static unsigned amd_find_arch_event(struct kvm_pmu *pmu,
+				    u8 event_select,
+				    u8 unit_mask)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
+		if (amd_event_mapping[i].eventsel == event_select
+		    && amd_event_mapping[i].unit_mask == unit_mask)
+			break;
+
+	if (i == ARRAY_SIZE(amd_event_mapping))
+		return PERF_COUNT_HW_MAX;
+
+	return amd_event_mapping[i].event_type;
+}
+
+/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
+static unsigned amd_find_fixed_event(int idx)
+{
+	return PERF_COUNT_HW_MAX;
+}
+
+/* check if a PMC is enabled by comparing it against global_ctrl bits. Because
+ * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE).
+ */
+static bool amd_pmc_is_enabled(struct kvm_pmc *pmc)
+{
+	return true;
+}
+
+static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+	return get_gp_pmc(pmu, MSR_K7_EVNTSEL0 + pmc_idx, MSR_K7_EVNTSEL0);
+}
+
+/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
+static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+	idx &= ~(3u << 30);
+
+	return (idx >= pmu->nr_arch_gp_counters);
+}
+
+/* idx is the ECX register of RDPMC instruction */
+static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *counters;
+
+	idx &= ~(3u << 30);
+	if (idx >= pmu->nr_arch_gp_counters)
+		return NULL;
+	counters = pmu->gp_counters;
+
+	return &counters[idx];
+}
+
+static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	int ret = false;
+
+	ret = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0) ||
+		get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
+
+	return ret;
+}
+
+static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *pmc;
+
+	/* MSR_K7_PERFCTRn */
+	pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0);
+	if (pmc) {
+		*data = pmc_read_counter(pmc);
+		return 0;
+	}
+	/* MSR_K7_EVNTSELn */
+	pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
+	if (pmc) {
+		*data = pmc->eventsel;
+		return 0;
+	}
+
+	return 1;
+}
+
+static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *pmc;
+	u32 msr = msr_info->index;
+	u64 data = msr_info->data;
+
+	/* MSR_K7_PERFCTRn */
+	pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0);
+	if (pmc) {
+		if (!msr_info->host_initiated)
+			data = (s64)data;
+		pmc->counter += data - pmc_read_counter(pmc);
+		return 0;
+	}
+	/* MSR_K7_EVNTSELn */
+	pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
+	if (pmc) {
+		if (data == pmc->eventsel)
+			return 0;
+		if (!(data & pmu->reserved_bits)) {
+			reprogram_gp_counter(pmc, data);
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+	pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+	pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
+	pmu->reserved_bits = 0xffffffff00200000ull;
+	/* not applicable to AMD; but clean them to prevent any fall out */
+	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+	pmu->nr_arch_fixed_counters = 0;
+	pmu->version = 0;
+	pmu->global_status = 0;
+}
+
+static void amd_pmu_init(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	int i;
+
+	for (i = 0; i < AMD64_NUM_COUNTERS ; i++) {
+		pmu->gp_counters[i].type = KVM_PMC_GP;
+		pmu->gp_counters[i].vcpu = vcpu;
+		pmu->gp_counters[i].idx = i;
+	}
+}
+
+static void amd_pmu_reset(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	int i;
+
+	for (i = 0; i < AMD64_NUM_COUNTERS; i++) {
+		struct kvm_pmc *pmc = &pmu->gp_counters[i];
+
+		pmc_stop_counter(pmc);
+		pmc->counter = pmc->eventsel = 0;
+	}
+}
+
+struct kvm_pmu_ops amd_pmu_ops = {
+	.find_arch_event = amd_find_arch_event,
+	.find_fixed_event = amd_find_fixed_event,
+	.pmc_is_enabled = amd_pmc_is_enabled,
+	.pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
+	.msr_idx_to_pmc = amd_msr_idx_to_pmc,
+	.is_valid_msr_idx = amd_is_valid_msr_idx,
+	.is_valid_msr = amd_is_valid_msr,
+	.get_msr = amd_pmu_get_msr,
+	.set_msr = amd_pmu_set_msr,
+	.refresh = amd_pmu_refresh,
+	.init = amd_pmu_init,
+	.reset = amd_pmu_reset,
+};
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
new file mode 100644
index 000000000000..ab38af4f4947
--- /dev/null
+++ b/arch/x86/kvm/pmu_intel.c
@@ -0,0 +1,358 @@
+/*
+ * KVM PMU support for Intel CPUs
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ *   Avi Kivity   <avi@redhat.com>
+ *   Gleb Natapov <gleb@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include <asm/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+
+static struct kvm_event_hw_type_mapping intel_arch_events[] = {
+	/* Index must match CPUID 0x0A.EBX bit vector */
+	[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+	[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+	[2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES  },
+	[3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
+	[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
+	[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+	[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+	[7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
+};
+
+/* mapping between fixed pmc index and intel_arch_events array */
+static int fixed_pmc_events[] = {1, 0, 7};
+
+static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
+{
+	int i;
+
+	for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+		u8 new_ctrl = fixed_ctrl_field(data, i);
+		u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i);
+		struct kvm_pmc *pmc;
+
+		pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
+
+		if (old_ctrl == new_ctrl)
+			continue;
+
+		reprogram_fixed_counter(pmc, new_ctrl, i);
+	}
+
+	pmu->fixed_ctr_ctrl = data;
+}
+
+/* function is called when global control register has been updated. */
+static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
+{
+	int bit;
+	u64 diff = pmu->global_ctrl ^ data;
+
+	pmu->global_ctrl = data;
+
+	for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+		reprogram_counter(pmu, bit);
+}
+
+static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
+				      u8 event_select,
+				      u8 unit_mask)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
+		if (intel_arch_events[i].eventsel == event_select
+		    && intel_arch_events[i].unit_mask == unit_mask
+		    && (pmu->available_event_types & (1 << i)))
+			break;
+
+	if (i == ARRAY_SIZE(intel_arch_events))
+		return PERF_COUNT_HW_MAX;
+
+	return intel_arch_events[i].event_type;
+}
+
+static unsigned intel_find_fixed_event(int idx)
+{
+	if (idx >= ARRAY_SIZE(fixed_pmc_events))
+		return PERF_COUNT_HW_MAX;
+
+	return intel_arch_events[fixed_pmc_events[idx]].event_type;
+}
+
+/* check if a PMC is enabled by comparising it with globl_ctrl bits. */
+static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
+{
+	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+	return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+}
+
+static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+	if (pmc_idx < INTEL_PMC_IDX_FIXED)
+		return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
+				  MSR_P6_EVNTSEL0);
+	else {
+		u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+
+		return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
+	}
+}
+
+/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
+static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	bool fixed = idx & (1u << 30);
+
+	idx &= ~(3u << 30);
+
+	return (!fixed && idx >= pmu->nr_arch_gp_counters) ||
+		(fixed && idx >= pmu->nr_arch_fixed_counters);
+}
+
+static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu,
+					    unsigned idx)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	bool fixed = idx & (1u << 30);
+	struct kvm_pmc *counters;
+
+	idx &= ~(3u << 30);
+	if (!fixed && idx >= pmu->nr_arch_gp_counters)
+		return NULL;
+	if (fixed && idx >= pmu->nr_arch_fixed_counters)
+		return NULL;
+	counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
+
+	return &counters[idx];
+}
+
+static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	int ret;
+
+	switch (msr) {
+	case MSR_CORE_PERF_FIXED_CTR_CTRL:
+	case MSR_CORE_PERF_GLOBAL_STATUS:
+	case MSR_CORE_PERF_GLOBAL_CTRL:
+	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+		ret = pmu->version > 1;
+		break;
+	default:
+		ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
+			get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
+			get_fixed_pmc(pmu, msr);
+		break;
+	}
+
+	return ret;
+}
+
+static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *pmc;
+
+	switch (msr) {
+	case MSR_CORE_PERF_FIXED_CTR_CTRL:
+		*data = pmu->fixed_ctr_ctrl;
+		return 0;
+	case MSR_CORE_PERF_GLOBAL_STATUS:
+		*data = pmu->global_status;
+		return 0;
+	case MSR_CORE_PERF_GLOBAL_CTRL:
+		*data = pmu->global_ctrl;
+		return 0;
+	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+		*data = pmu->global_ovf_ctrl;
+		return 0;
+	default:
+		if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+		    (pmc = get_fixed_pmc(pmu, msr))) {
+			*data = pmc_read_counter(pmc);
+			return 0;
+		} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
+			*data = pmc->eventsel;
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_pmc *pmc;
+	u32 msr = msr_info->index;
+	u64 data = msr_info->data;
+
+	switch (msr) {
+	case MSR_CORE_PERF_FIXED_CTR_CTRL:
+		if (pmu->fixed_ctr_ctrl == data)
+			return 0;
+		if (!(data & 0xfffffffffffff444ull)) {
+			reprogram_fixed_counters(pmu, data);
+			return 0;
+		}
+		break;
+	case MSR_CORE_PERF_GLOBAL_STATUS:
+		if (msr_info->host_initiated) {
+			pmu->global_status = data;
+			return 0;
+		}
+		break; /* RO MSR */
+	case MSR_CORE_PERF_GLOBAL_CTRL:
+		if (pmu->global_ctrl == data)
+			return 0;
+		if (!(data & pmu->global_ctrl_mask)) {
+			global_ctrl_changed(pmu, data);
+			return 0;
+		}
+		break;
+	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+		if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
+			if (!msr_info->host_initiated)
+				pmu->global_status &= ~data;
+			pmu->global_ovf_ctrl = data;
+			return 0;
+		}
+		break;
+	default:
+		if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+		    (pmc = get_fixed_pmc(pmu, msr))) {
+			if (!msr_info->host_initiated)
+				data = (s64)(s32)data;
+			pmc->counter += data - pmc_read_counter(pmc);
+			return 0;
+		} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
+			if (data == pmc->eventsel)
+				return 0;
+			if (!(data & pmu->reserved_bits)) {
+				reprogram_gp_counter(pmc, data);
+				return 0;
+			}
+		}
+	}
+
+	return 1;
+}
+
+static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_cpuid_entry2 *entry;
+	union cpuid10_eax eax;
+	union cpuid10_edx edx;
+
+	pmu->nr_arch_gp_counters = 0;
+	pmu->nr_arch_fixed_counters = 0;
+	pmu->counter_bitmask[KVM_PMC_GP] = 0;
+	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+	pmu->version = 0;
+	pmu->reserved_bits = 0xffffffff00200000ull;
+
+	entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
+	if (!entry)
+		return;
+	eax.full = entry->eax;
+	edx.full = entry->edx;
+
+	pmu->version = eax.split.version_id;
+	if (!pmu->version)
+		return;
+
+	pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
+					INTEL_PMC_MAX_GENERIC);
+	pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
+	pmu->available_event_types = ~entry->ebx &
+					((1ull << eax.split.mask_length) - 1);
+
+	if (pmu->version == 1) {
+		pmu->nr_arch_fixed_counters = 0;
+	} else {
+		pmu->nr_arch_fixed_counters =
+			min_t(int, edx.split.num_counters_fixed,
+				INTEL_PMC_MAX_FIXED);
+		pmu->counter_bitmask[KVM_PMC_FIXED] =
+			((u64)1 << edx.split.bit_width_fixed) - 1;
+	}
+
+	pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
+		(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
+	pmu->global_ctrl_mask = ~pmu->global_ctrl;
+
+	entry = kvm_find_cpuid_entry(vcpu, 7, 0);
+	if (entry &&
+	    (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
+	    (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
+		pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
+}
+
+static void intel_pmu_init(struct kvm_vcpu *vcpu)
+{
+	int i;
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+	for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
+		pmu->gp_counters[i].type = KVM_PMC_GP;
+		pmu->gp_counters[i].vcpu = vcpu;
+		pmu->gp_counters[i].idx = i;
+	}
+
+	for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
+		pmu->fixed_counters[i].type = KVM_PMC_FIXED;
+		pmu->fixed_counters[i].vcpu = vcpu;
+		pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
+	}
+}
+
+static void intel_pmu_reset(struct kvm_vcpu *vcpu)
+{
+	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	int i;
+
+	for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
+		struct kvm_pmc *pmc = &pmu->gp_counters[i];
+
+		pmc_stop_counter(pmc);
+		pmc->counter = pmc->eventsel = 0;
+	}
+
+	for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
+		pmc_stop_counter(&pmu->fixed_counters[i]);
+
+	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
+		pmu->global_ovf_ctrl = 0;
+}
+
+struct kvm_pmu_ops intel_pmu_ops = {
+	.find_arch_event = intel_find_arch_event,
+	.find_fixed_event = intel_find_fixed_event,
+	.pmc_is_enabled = intel_pmc_is_enabled,
+	.pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
+	.msr_idx_to_pmc = intel_msr_idx_to_pmc,
+	.is_valid_msr_idx = intel_is_valid_msr_idx,
+	.is_valid_msr = intel_is_valid_msr,
+	.get_msr = intel_pmu_get_msr,
+	.set_msr = intel_pmu_set_msr,
+	.refresh = intel_pmu_refresh,
+	.init = intel_pmu_init,
+	.reset = intel_pmu_reset,
+};
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce741b8650f6..851a9a1c6dfc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -21,6 +21,7 @@
 #include "kvm_cache_regs.h"
 #include "x86.h"
 #include "cpuid.h"
+#include "pmu.h"
 
 #include <linux/module.h>
 #include <linux/mod_devicetable.h>
@@ -511,8 +512,10 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	if (svm->vmcb->control.next_rip != 0)
+	if (svm->vmcb->control.next_rip != 0) {
+		WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS));
 		svm->next_rip = svm->vmcb->control.next_rip;
+	}
 
 	if (!svm->next_rip) {
 		if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
@@ -1082,7 +1085,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 	return target_tsc - tsc;
 }
 
-static void init_vmcb(struct vcpu_svm *svm)
+static void init_vmcb(struct vcpu_svm *svm, bool init_event)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
 	struct vmcb_save_area *save = &svm->vmcb->save;
@@ -1153,17 +1156,17 @@ static void init_vmcb(struct vcpu_svm *svm)
 	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
 	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 
-	svm_set_efer(&svm->vcpu, 0);
+	if (!init_event)
+		svm_set_efer(&svm->vcpu, 0);
 	save->dr6 = 0xffff0ff0;
 	kvm_set_rflags(&svm->vcpu, 2);
 	save->rip = 0x0000fff0;
 	svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
 
 	/*
-	 * This is the guest-visible cr0 value.
 	 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
+	 * It also updates the guest-visible cr0 value.
 	 */
-	svm->vcpu.arch.cr0 = 0;
 	(void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
 
 	save->cr4 = X86_CR4_PAE;
@@ -1176,7 +1179,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 		clr_exception_intercept(svm, PF_VECTOR);
 		clr_cr_intercept(svm, INTERCEPT_CR3_READ);
 		clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
-		save->g_pat = 0x0007040600070406ULL;
+		save->g_pat = svm->vcpu.arch.pat;
 		save->cr3 = 0;
 		save->cr4 = 0;
 	}
@@ -1195,13 +1198,19 @@ static void init_vmcb(struct vcpu_svm *svm)
 	enable_gif(svm);
 }
 
-static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
+static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u32 dummy;
 	u32 eax = 1;
 
-	init_vmcb(svm);
+	if (!init_event) {
+		svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+					   MSR_IA32_APICBASE_ENABLE;
+		if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
+			svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+	}
+	init_vmcb(svm, init_event);
 
 	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
 	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
@@ -1257,12 +1266,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	clear_page(svm->vmcb);
 	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
 	svm->asid_generation = 0;
-	init_vmcb(svm);
-
-	svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
-				   MSR_IA32_APICBASE_ENABLE;
-	if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
-		svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+	init_vmcb(svm, false);
 
 	svm_init_osvw(&svm->vcpu);
 
@@ -1575,7 +1579,8 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	 * does not do it - this results in some delay at
 	 * reboot
 	 */
-	cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+	if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_CD_NW_CLEARED))
+		cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
 	svm->vmcb->save.cr0 = cr0;
 	mark_dirty(svm->vmcb, VMCB_CR);
 	update_cr0_intercept(svm);
@@ -1883,7 +1888,7 @@ static int shutdown_interception(struct vcpu_svm *svm)
 	 * so reinitialize it.
 	 */
 	clear_page(svm->vmcb);
-	init_vmcb(svm);
+	init_vmcb(svm, false);
 
 	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
 	return 0;
@@ -1953,8 +1958,8 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
 	u64 pdpte;
 	int ret;
 
-	ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte,
-				  offset_in_page(cr3) + index * 8, 8);
+	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
+				       offset_in_page(cr3) + index * 8, 8);
 	if (ret)
 		return 0;
 	return pdpte;
@@ -2112,7 +2117,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
 
 	might_sleep();
 
-	page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
+	page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
 	if (is_error_page(page))
 		goto error;
 
@@ -2151,7 +2156,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
 	mask = (0xf >> (4 - size)) << start_bit;
 	val = 0;
 
-	if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len))
+	if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
 		return NESTED_EXIT_DONE;
 
 	return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
@@ -2176,7 +2181,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 	/* Offset is in 32 bit units but need in 8 bit units */
 	offset *= 4;
 
-	if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
+	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
 		return NESTED_EXIT_DONE;
 
 	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
@@ -2447,7 +2452,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 		p      = msrpm_offsets[i];
 		offset = svm->nested.vmcb_msrpm + (p * 4);
 
-		if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
+		if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
 			return false;
 
 		svm->nested.msrpm[p] = svm->msrpm[p] | value;
@@ -3067,42 +3072,42 @@ static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 		svm_scale_tsc(vcpu, host_tsc);
 }
 
-static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
+static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	switch (ecx) {
+	switch (msr_info->index) {
 	case MSR_IA32_TSC: {
-		*data = svm->vmcb->control.tsc_offset +
+		msr_info->data = svm->vmcb->control.tsc_offset +
 			svm_scale_tsc(vcpu, native_read_tsc());
 
 		break;
 	}
 	case MSR_STAR:
-		*data = svm->vmcb->save.star;
+		msr_info->data = svm->vmcb->save.star;
 		break;
 #ifdef CONFIG_X86_64
 	case MSR_LSTAR:
-		*data = svm->vmcb->save.lstar;
+		msr_info->data = svm->vmcb->save.lstar;
 		break;
 	case MSR_CSTAR:
-		*data = svm->vmcb->save.cstar;
+		msr_info->data = svm->vmcb->save.cstar;
 		break;
 	case MSR_KERNEL_GS_BASE:
-		*data = svm->vmcb->save.kernel_gs_base;
+		msr_info->data = svm->vmcb->save.kernel_gs_base;
 		break;
 	case MSR_SYSCALL_MASK:
-		*data = svm->vmcb->save.sfmask;
+		msr_info->data = svm->vmcb->save.sfmask;
 		break;
 #endif
 	case MSR_IA32_SYSENTER_CS:
-		*data = svm->vmcb->save.sysenter_cs;
+		msr_info->data = svm->vmcb->save.sysenter_cs;
 		break;
 	case MSR_IA32_SYSENTER_EIP:
-		*data = svm->sysenter_eip;
+		msr_info->data = svm->sysenter_eip;
 		break;
 	case MSR_IA32_SYSENTER_ESP:
-		*data = svm->sysenter_esp;
+		msr_info->data = svm->sysenter_esp;
 		break;
 	/*
 	 * Nobody will change the following 5 values in the VMCB so we can
@@ -3110,31 +3115,31 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 	 * implemented.
 	 */
 	case MSR_IA32_DEBUGCTLMSR:
-		*data = svm->vmcb->save.dbgctl;
+		msr_info->data = svm->vmcb->save.dbgctl;
 		break;
 	case MSR_IA32_LASTBRANCHFROMIP:
-		*data = svm->vmcb->save.br_from;
+		msr_info->data = svm->vmcb->save.br_from;
 		break;
 	case MSR_IA32_LASTBRANCHTOIP:
-		*data = svm->vmcb->save.br_to;
+		msr_info->data = svm->vmcb->save.br_to;
 		break;
 	case MSR_IA32_LASTINTFROMIP:
-		*data = svm->vmcb->save.last_excp_from;
+		msr_info->data = svm->vmcb->save.last_excp_from;
 		break;
 	case MSR_IA32_LASTINTTOIP:
-		*data = svm->vmcb->save.last_excp_to;
+		msr_info->data = svm->vmcb->save.last_excp_to;
 		break;
 	case MSR_VM_HSAVE_PA:
-		*data = svm->nested.hsave_msr;
+		msr_info->data = svm->nested.hsave_msr;
 		break;
 	case MSR_VM_CR:
-		*data = svm->nested.vm_cr_msr;
+		msr_info->data = svm->nested.vm_cr_msr;
 		break;
 	case MSR_IA32_UCODE_REV:
-		*data = 0x01000065;
+		msr_info->data = 0x01000065;
 		break;
 	default:
-		return kvm_get_msr_common(vcpu, ecx, data);
+		return kvm_get_msr_common(vcpu, msr_info);
 	}
 	return 0;
 }
@@ -3142,16 +3147,20 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 static int rdmsr_interception(struct vcpu_svm *svm)
 {
 	u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
-	u64 data;
+	struct msr_data msr_info;
 
-	if (svm_get_msr(&svm->vcpu, ecx, &data)) {
+	msr_info.index = ecx;
+	msr_info.host_initiated = false;
+	if (svm_get_msr(&svm->vcpu, &msr_info)) {
 		trace_kvm_msr_read_ex(ecx);
 		kvm_inject_gp(&svm->vcpu, 0);
 	} else {
-		trace_kvm_msr_read(ecx, data);
+		trace_kvm_msr_read(ecx, msr_info.data);
 
-		kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff);
-		kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32);
+		kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
+				   msr_info.data & 0xffffffff);
+		kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
+				   msr_info.data >> 32);
 		svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
 		skip_emulated_instruction(&svm->vcpu);
 	}
@@ -3388,6 +3397,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_MWAIT]			= mwait_interception,
 	[SVM_EXIT_XSETBV]			= xsetbv_interception,
 	[SVM_EXIT_NPF]				= pf_interception,
+	[SVM_EXIT_RSM]                          = emulate_on_interception,
 };
 
 static void dump_vmcb(struct kvm_vcpu *vcpu)
@@ -4073,6 +4083,11 @@ static bool svm_cpu_has_accelerated_tpr(void)
 	return false;
 }
 
+static bool svm_has_high_real_mode_segbase(void)
+{
+	return true;
+}
+
 static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
 	return 0;
@@ -4317,7 +4332,9 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
 		break;
 	}
 
-	vmcb->control.next_rip  = info->next_rip;
+	/* TODO: Advertise NRIPS to guest hypervisor unconditionally */
+	if (static_cpu_has(X86_FEATURE_NRIPS))
+		vmcb->control.next_rip  = info->next_rip;
 	vmcb->control.exit_code = icpt_info.exit_code;
 	vmexit = nested_svm_exit_handled(svm);
 
@@ -4346,6 +4363,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.hardware_enable = svm_hardware_enable,
 	.hardware_disable = svm_hardware_disable,
 	.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+	.cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase,
 
 	.vcpu_create = svm_create_vcpu,
 	.vcpu_free = svm_free_vcpu,
@@ -4381,6 +4399,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.cache_reg = svm_cache_reg,
 	.get_rflags = svm_get_rflags,
 	.set_rflags = svm_set_rflags,
+	.fpu_activate = svm_fpu_activate,
 	.fpu_deactivate = svm_fpu_deactivate,
 
 	.tlb_flush = svm_flush_tlb,
@@ -4439,6 +4458,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.handle_external_intr = svm_handle_external_intr,
 
 	.sched_in = svm_sched_in,
+
+	.pmu_ops = &amd_pmu_ops,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 7c7bc8bef21f..4eae7c35ddf5 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -952,6 +952,28 @@ TRACE_EVENT(kvm_wait_lapic_expire,
 		  __entry->delta < 0 ? "early" : "late")
 );
 
+TRACE_EVENT(kvm_enter_smm,
+	TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
+	TP_ARGS(vcpu_id, smbase, entering),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	vcpu_id		)
+		__field(	u64,		smbase		)
+		__field(	bool,		entering	)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id	= vcpu_id;
+		__entry->smbase		= smbase;
+		__entry->entering	= entering;
+	),
+
+	TP_printk("vcpu %u: %s SMM, smbase 0x%llx",
+		  __entry->vcpu_id,
+		  __entry->entering ? "entering" : "leaving",
+		  __entry->smbase)
+);
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f7b61687bd79..ab53d80b0f64 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -40,14 +40,14 @@
 #include <asm/vmx.h>
 #include <asm/virtext.h>
 #include <asm/mce.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
+#include <asm/fpu/internal.h>
 #include <asm/perf_event.h>
 #include <asm/debugreg.h>
 #include <asm/kexec.h>
 #include <asm/apic.h>
 
 #include "trace.h"
+#include "pmu.h"
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 #define __ex_clear(x, reg) \
@@ -786,7 +786,7 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
 
 static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
 {
-	struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
+	struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT);
 	if (is_error_page(page))
 		return NULL;
 
@@ -1883,7 +1883,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 	 * If the FPU is not active (through the host task or
 	 * the guest vcpu), then restore the cr0.TS bit.
 	 */
-	if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded)
+	if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded)
 		stts();
 	load_gdt(this_cpu_ptr(&host_gdt));
 }
@@ -2170,8 +2170,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 
 	if (is_guest_mode(vcpu))
 		msr_bitmap = vmx_msr_bitmap_nested;
-	else if (irqchip_in_kernel(vcpu->kvm) &&
-		apic_x2apic_mode(vcpu->arch.apic)) {
+	else if (vcpu->arch.apic_base & X2APIC_ENABLE) {
 		if (is_long_mode(vcpu))
 			msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
 		else
@@ -2623,76 +2622,69 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
  */
-static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
-	u64 data;
 	struct shared_msr_entry *msr;
 
-	if (!pdata) {
-		printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
-		return -EINVAL;
-	}
-
-	switch (msr_index) {
+	switch (msr_info->index) {
 #ifdef CONFIG_X86_64
 	case MSR_FS_BASE:
-		data = vmcs_readl(GUEST_FS_BASE);
+		msr_info->data = vmcs_readl(GUEST_FS_BASE);
 		break;
 	case MSR_GS_BASE:
-		data = vmcs_readl(GUEST_GS_BASE);
+		msr_info->data = vmcs_readl(GUEST_GS_BASE);
 		break;
 	case MSR_KERNEL_GS_BASE:
 		vmx_load_host_state(to_vmx(vcpu));
-		data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+		msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
 		break;
 #endif
 	case MSR_EFER:
-		return kvm_get_msr_common(vcpu, msr_index, pdata);
+		return kvm_get_msr_common(vcpu, msr_info);
 	case MSR_IA32_TSC:
-		data = guest_read_tsc();
+		msr_info->data = guest_read_tsc();
 		break;
 	case MSR_IA32_SYSENTER_CS:
-		data = vmcs_read32(GUEST_SYSENTER_CS);
+		msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
 		break;
 	case MSR_IA32_SYSENTER_EIP:
-		data = vmcs_readl(GUEST_SYSENTER_EIP);
+		msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
 		break;
 	case MSR_IA32_SYSENTER_ESP:
-		data = vmcs_readl(GUEST_SYSENTER_ESP);
+		msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
 		break;
 	case MSR_IA32_BNDCFGS:
 		if (!vmx_mpx_supported())
 			return 1;
-		data = vmcs_read64(GUEST_BNDCFGS);
+		msr_info->data = vmcs_read64(GUEST_BNDCFGS);
 		break;
 	case MSR_IA32_FEATURE_CONTROL:
 		if (!nested_vmx_allowed(vcpu))
 			return 1;
-		data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+		msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
 		break;
 	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
 		if (!nested_vmx_allowed(vcpu))
 			return 1;
-		return vmx_get_vmx_msr(vcpu, msr_index, pdata);
+		return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data);
 	case MSR_IA32_XSS:
 		if (!vmx_xsaves_supported())
 			return 1;
-		data = vcpu->arch.ia32_xss;
+		msr_info->data = vcpu->arch.ia32_xss;
 		break;
 	case MSR_TSC_AUX:
 		if (!to_vmx(vcpu)->rdtscp_enabled)
 			return 1;
 		/* Otherwise falls through */
 	default:
-		msr = find_msr_entry(to_vmx(vcpu), msr_index);
+		msr = find_msr_entry(to_vmx(vcpu), msr_info->index);
 		if (msr) {
-			data = msr->data;
+			msr_info->data = msr->data;
 			break;
 		}
-		return kvm_get_msr_common(vcpu, msr_index, pdata);
+		return kvm_get_msr_common(vcpu, msr_info);
 	}
 
-	*pdata = data;
 	return 0;
 }
 
@@ -4123,7 +4115,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
 	kvm_userspace_mem.flags = 0;
 	kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE;
 	kvm_userspace_mem.memory_size = PAGE_SIZE;
-	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
+	r = __x86_set_memory_region(kvm, &kvm_userspace_mem);
 	if (r)
 		goto out;
 
@@ -4158,7 +4150,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
 	kvm_userspace_mem.guest_phys_addr =
 		kvm->arch.ept_identity_map_addr;
 	kvm_userspace_mem.memory_size = PAGE_SIZE;
-	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
+	r = __x86_set_memory_region(kvm, &kvm_userspace_mem);
 
 	return r;
 }
@@ -4667,16 +4659,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
 	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
 
-	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
-		u32 msr_low, msr_high;
-		u64 host_pat;
-		rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
-		host_pat = msr_low | ((u64) msr_high << 32);
-		/* Write the default value follow host pat */
-		vmcs_write64(GUEST_IA32_PAT, host_pat);
-		/* Keep arch.pat sync with GUEST_IA32_PAT */
-		vmx->vcpu.arch.pat = host_pat;
-	}
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
 
 	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
 		u32 index = vmx_msr_index[i];
@@ -4708,22 +4692,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	return 0;
 }
 
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct msr_data apic_base_msr;
+	u64 cr0;
 
 	vmx->rmode.vm86_active = 0;
 
 	vmx->soft_vnmi_blocked = 0;
 
 	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
-	kvm_set_cr8(&vmx->vcpu, 0);
-	apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
-	if (kvm_vcpu_is_reset_bsp(&vmx->vcpu))
-		apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
-	apic_base_msr.host_initiated = true;
-	kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
+	kvm_set_cr8(vcpu, 0);
+
+	if (!init_event) {
+		apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
+				     MSR_IA32_APICBASE_ENABLE;
+		if (kvm_vcpu_is_reset_bsp(vcpu))
+			apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
+		apic_base_msr.host_initiated = true;
+		kvm_set_apic_base(vcpu, &apic_base_msr);
+	}
 
 	vmx_segment_cache_clear(vmx);
 
@@ -4747,9 +4736,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
 	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
 
-	vmcs_write32(GUEST_SYSENTER_CS, 0);
-	vmcs_writel(GUEST_SYSENTER_ESP, 0);
-	vmcs_writel(GUEST_SYSENTER_EIP, 0);
+	if (!init_event) {
+		vmcs_write32(GUEST_SYSENTER_CS, 0);
+		vmcs_writel(GUEST_SYSENTER_ESP, 0);
+		vmcs_writel(GUEST_SYSENTER_EIP, 0);
+		vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+	}
 
 	vmcs_writel(GUEST_RFLAGS, 0x02);
 	kvm_rip_write(vcpu, 0xfff0);
@@ -4764,18 +4756,15 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
 	vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
 
-	/* Special registers */
-	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-
 	setup_msrs(vmx);
 
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
 
-	if (cpu_has_vmx_tpr_shadow()) {
+	if (cpu_has_vmx_tpr_shadow() && !init_event) {
 		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
-		if (vm_need_tpr_shadow(vmx->vcpu.kvm))
+		if (vm_need_tpr_shadow(vcpu->kvm))
 			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
-				     __pa(vmx->vcpu.arch.apic->regs));
+				     __pa(vcpu->arch.apic->regs));
 		vmcs_write32(TPR_THRESHOLD, 0);
 	}
 
@@ -4787,12 +4776,14 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	if (vmx->vpid != 0)
 		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
 
-	vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
-	vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
-	vmx_set_cr4(&vmx->vcpu, 0);
-	vmx_set_efer(&vmx->vcpu, 0);
-	vmx_fpu_activate(&vmx->vcpu);
-	update_exception_bitmap(&vmx->vcpu);
+	cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+	vmx_set_cr0(vcpu, cr0); /* enter rmode */
+	vmx->vcpu.arch.cr0 = cr0;
+	vmx_set_cr4(vcpu, 0);
+	if (!init_event)
+		vmx_set_efer(vcpu, 0);
+	vmx_fpu_activate(vcpu);
+	update_exception_bitmap(vcpu);
 
 	vpid_sync_context(vmx);
 }
@@ -4965,7 +4956,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 		.flags = 0,
 	};
 
-	ret = kvm_set_memory_region(kvm, &tss_mem);
+	ret = x86_set_memory_region(kvm, &tss_mem);
 	if (ret)
 		return ret;
 	kvm->arch.tss_addr = addr;
@@ -5475,19 +5466,21 @@ static int handle_cpuid(struct kvm_vcpu *vcpu)
 static int handle_rdmsr(struct kvm_vcpu *vcpu)
 {
 	u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
-	u64 data;
+	struct msr_data msr_info;
 
-	if (vmx_get_msr(vcpu, ecx, &data)) {
+	msr_info.index = ecx;
+	msr_info.host_initiated = false;
+	if (vmx_get_msr(vcpu, &msr_info)) {
 		trace_kvm_msr_read_ex(ecx);
 		kvm_inject_gp(vcpu, 0);
 		return 1;
 	}
 
-	trace_kvm_msr_read(ecx, data);
+	trace_kvm_msr_read(ecx, msr_info.data);
 
 	/* FIXME: handling of bits 32:63 of rax, rdx */
-	vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
-	vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
+	vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
+	vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
 	skip_emulated_instruction(vcpu);
 	return 1;
 }
@@ -5710,9 +5703,6 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	/* clear all local breakpoint enable flags */
-	vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155);
-
 	/*
 	 * TODO: What about debug traps on tss switch?
 	 *       Are we supposed to inject them and update dr6?
@@ -7333,7 +7323,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
 		bitmap += (port & 0x7fff) / 8;
 
 		if (last_bitmap != bitmap)
-			if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
+			if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
 				return true;
 		if (b & (1 << (port & 7)))
 			return true;
@@ -7377,7 +7367,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
 	/* Then read the msr_index'th bit from this bitmap: */
 	if (msr_index < 1024*8) {
 		unsigned char b;
-		if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
+		if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
 			return true;
 		return 1 & (b >> (msr_index & 7));
 	} else
@@ -7642,9 +7632,9 @@ static void vmx_disable_pml(struct vcpu_vmx *vmx)
 	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 }
 
-static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
+static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
 {
-	struct kvm *kvm = vmx->vcpu.kvm;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u64 *pml_buf;
 	u16 pml_idx;
 
@@ -7666,7 +7656,7 @@ static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
 
 		gpa = pml_buf[pml_idx];
 		WARN_ON(gpa & (PAGE_SIZE - 1));
-		mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
+		kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
 	}
 
 	/* reset PML index */
@@ -7691,6 +7681,158 @@ static void kvm_flush_pml_buffers(struct kvm *kvm)
 		kvm_vcpu_kick(vcpu);
 }
 
+static void vmx_dump_sel(char *name, uint32_t sel)
+{
+	pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
+	       name, vmcs_read32(sel),
+	       vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
+	       vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
+	       vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
+}
+
+static void vmx_dump_dtsel(char *name, uint32_t limit)
+{
+	pr_err("%s                           limit=0x%08x, base=0x%016lx\n",
+	       name, vmcs_read32(limit),
+	       vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
+}
+
+static void dump_vmcs(void)
+{
+	u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
+	u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
+	u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+	u32 secondary_exec_control = 0;
+	unsigned long cr4 = vmcs_readl(GUEST_CR4);
+	u64 efer = vmcs_readl(GUEST_IA32_EFER);
+	int i, n;
+
+	if (cpu_has_secondary_exec_ctrls())
+		secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+	pr_err("*** Guest State ***\n");
+	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+	       vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
+	       vmcs_readl(CR0_GUEST_HOST_MASK));
+	pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+	       cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
+	pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
+	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
+	    (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
+	{
+		pr_err("PDPTR0 = 0x%016lx  PDPTR1 = 0x%016lx\n",
+		       vmcs_readl(GUEST_PDPTR0), vmcs_readl(GUEST_PDPTR1));
+		pr_err("PDPTR2 = 0x%016lx  PDPTR3 = 0x%016lx\n",
+		       vmcs_readl(GUEST_PDPTR2), vmcs_readl(GUEST_PDPTR3));
+	}
+	pr_err("RSP = 0x%016lx  RIP = 0x%016lx\n",
+	       vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
+	pr_err("RFLAGS=0x%08lx         DR7 = 0x%016lx\n",
+	       vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
+	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+	       vmcs_readl(GUEST_SYSENTER_ESP),
+	       vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
+	vmx_dump_sel("CS:  ", GUEST_CS_SELECTOR);
+	vmx_dump_sel("DS:  ", GUEST_DS_SELECTOR);
+	vmx_dump_sel("SS:  ", GUEST_SS_SELECTOR);
+	vmx_dump_sel("ES:  ", GUEST_ES_SELECTOR);
+	vmx_dump_sel("FS:  ", GUEST_FS_SELECTOR);
+	vmx_dump_sel("GS:  ", GUEST_GS_SELECTOR);
+	vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
+	vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
+	vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
+	vmx_dump_sel("TR:  ", GUEST_TR_SELECTOR);
+	if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
+	    (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
+		pr_err("EFER =     0x%016llx  PAT = 0x%016lx\n",
+		       efer, vmcs_readl(GUEST_IA32_PAT));
+	pr_err("DebugCtl = 0x%016lx  DebugExceptions = 0x%016lx\n",
+	       vmcs_readl(GUEST_IA32_DEBUGCTL),
+	       vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
+	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+		pr_err("PerfGlobCtl = 0x%016lx\n",
+		       vmcs_readl(GUEST_IA32_PERF_GLOBAL_CTRL));
+	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
+		pr_err("BndCfgS = 0x%016lx\n", vmcs_readl(GUEST_BNDCFGS));
+	pr_err("Interruptibility = %08x  ActivityState = %08x\n",
+	       vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
+	       vmcs_read32(GUEST_ACTIVITY_STATE));
+	if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+		pr_err("InterruptStatus = %04x\n",
+		       vmcs_read16(GUEST_INTR_STATUS));
+
+	pr_err("*** Host State ***\n");
+	pr_err("RIP = 0x%016lx  RSP = 0x%016lx\n",
+	       vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
+	pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
+	       vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
+	       vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
+	       vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
+	       vmcs_read16(HOST_TR_SELECTOR));
+	pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
+	       vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
+	       vmcs_readl(HOST_TR_BASE));
+	pr_err("GDTBase=%016lx IDTBase=%016lx\n",
+	       vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
+	pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
+	       vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
+	       vmcs_readl(HOST_CR4));
+	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+	       vmcs_readl(HOST_IA32_SYSENTER_ESP),
+	       vmcs_read32(HOST_IA32_SYSENTER_CS),
+	       vmcs_readl(HOST_IA32_SYSENTER_EIP));
+	if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
+		pr_err("EFER = 0x%016lx  PAT = 0x%016lx\n",
+		       vmcs_readl(HOST_IA32_EFER), vmcs_readl(HOST_IA32_PAT));
+	if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+		pr_err("PerfGlobCtl = 0x%016lx\n",
+		       vmcs_readl(HOST_IA32_PERF_GLOBAL_CTRL));
+
+	pr_err("*** Control State ***\n");
+	pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
+	       pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
+	pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
+	pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
+	       vmcs_read32(EXCEPTION_BITMAP),
+	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
+	       vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
+	pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
+	       vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+	       vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
+	       vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
+	pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
+	       vmcs_read32(VM_EXIT_INTR_INFO),
+	       vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+	       vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+	pr_err("        reason=%08x qualification=%016lx\n",
+	       vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
+	pr_err("IDTVectoring: info=%08x errcode=%08x\n",
+	       vmcs_read32(IDT_VECTORING_INFO_FIELD),
+	       vmcs_read32(IDT_VECTORING_ERROR_CODE));
+	pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET));
+	if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
+		pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
+	if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
+		pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
+	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
+		pr_err("EPT pointer = 0x%016lx\n", vmcs_readl(EPT_POINTER));
+	n = vmcs_read32(CR3_TARGET_COUNT);
+	for (i = 0; i + 1 < n; i += 4)
+		pr_err("CR3 target%u=%016lx target%u=%016lx\n",
+		       i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
+		       i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
+	if (i < n)
+		pr_err("CR3 target%u=%016lx\n",
+		       i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
+	if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
+		pr_err("PLE Gap=%08x Window=%08x\n",
+		       vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
+	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
+		pr_err("Virtual processor ID = 0x%04x\n",
+		       vmcs_read16(VIRTUAL_PROCESSOR_ID));
+}
+
 /*
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
@@ -7709,7 +7851,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	 * flushed already.
 	 */
 	if (enable_pml)
-		vmx_flush_pml_buffer(vmx);
+		vmx_flush_pml_buffer(vcpu);
 
 	/* If guest state is invalid, start emulating */
 	if (vmx->emulation_required)
@@ -7723,6 +7865,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	}
 
 	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+		dump_vmcs();
 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		vcpu->run->fail_entry.hardware_entry_failure_reason
 			= exit_reason;
@@ -7996,6 +8139,11 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
 		local_irq_enable();
 }
 
+static bool vmx_has_high_real_mode_segbase(void)
+{
+	return enable_unrestricted_guest || emulate_invalid_guest_state;
+}
+
 static bool vmx_mpx_supported(void)
 {
 	return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
@@ -8480,7 +8628,8 @@ static int get_ept_level(void)
 
 static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
-	u64 ret;
+	u8 cache;
+	u64 ipat = 0;
 
 	/* For VT-d and EPT combination
 	 * 1. MMIO: always map as UC
@@ -8493,16 +8642,27 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 	 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
 	 *    consistent with host MTRR
 	 */
-	if (is_mmio)
-		ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
-	else if (kvm_arch_has_noncoherent_dma(vcpu->kvm))
-		ret = kvm_get_guest_memory_type(vcpu, gfn) <<
-		      VMX_EPT_MT_EPTE_SHIFT;
-	else
-		ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
-			| VMX_EPT_IPAT_BIT;
+	if (is_mmio) {
+		cache = MTRR_TYPE_UNCACHABLE;
+		goto exit;
+	}
 
-	return ret;
+	if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
+		ipat = VMX_EPT_IPAT_BIT;
+		cache = MTRR_TYPE_WRBACK;
+		goto exit;
+	}
+
+	if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
+		ipat = VMX_EPT_IPAT_BIT;
+		cache = MTRR_TYPE_UNCACHABLE;
+		goto exit;
+	}
+
+	cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
+
+exit:
+	return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
 }
 
 static int vmx_get_lpage_level(void)
@@ -8924,7 +9084,7 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
 				       struct vmx_msr_entry *e)
 {
 	/* x2APIC MSR accesses are not allowed */
-	if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
+	if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
 		return -EINVAL;
 	if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
 	    e->index == MSR_IA32_UCODE_REV)
@@ -8966,8 +9126,8 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 
 	msr.host_initiated = false;
 	for (i = 0; i < count; i++) {
-		if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
-				   &e, sizeof(e))) {
+		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
+					&e, sizeof(e))) {
 			pr_warn_ratelimited(
 				"%s cannot read MSR entry (%u, 0x%08llx)\n",
 				__func__, i, gpa + i * sizeof(e));
@@ -8999,9 +9159,10 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 	struct vmx_msr_entry e;
 
 	for (i = 0; i < count; i++) {
-		if (kvm_read_guest(vcpu->kvm,
-				   gpa + i * sizeof(e),
-				   &e, 2 * sizeof(u32))) {
+		struct msr_data msr_info;
+		if (kvm_vcpu_read_guest(vcpu,
+					gpa + i * sizeof(e),
+					&e, 2 * sizeof(u32))) {
 			pr_warn_ratelimited(
 				"%s cannot read MSR entry (%u, 0x%08llx)\n",
 				__func__, i, gpa + i * sizeof(e));
@@ -9013,19 +9174,21 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
 				__func__, i, e.index, e.reserved);
 			return -EINVAL;
 		}
-		if (kvm_get_msr(vcpu, e.index, &e.value)) {
+		msr_info.host_initiated = false;
+		msr_info.index = e.index;
+		if (kvm_get_msr(vcpu, &msr_info)) {
 			pr_warn_ratelimited(
 				"%s cannot read MSR (%u, 0x%x)\n",
 				__func__, i, e.index);
 			return -EINVAL;
 		}
-		if (kvm_write_guest(vcpu->kvm,
-				    gpa + i * sizeof(e) +
-					offsetof(struct vmx_msr_entry, value),
-				    &e.value, sizeof(e.value))) {
+		if (kvm_vcpu_write_guest(vcpu,
+					 gpa + i * sizeof(e) +
+					     offsetof(struct vmx_msr_entry, value),
+					 &msr_info.data, sizeof(msr_info.data))) {
 			pr_warn_ratelimited(
 				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
-				__func__, i, e.index, e.value);
+				__func__, i, e.index, msr_info.data);
 			return -EINVAL;
 		}
 	}
@@ -10150,6 +10313,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.hardware_enable = hardware_enable,
 	.hardware_disable = hardware_disable,
 	.cpu_has_accelerated_tpr = report_flexpriority,
+	.cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
 
 	.vcpu_create = vmx_create_vcpu,
 	.vcpu_free = vmx_free_vcpu,
@@ -10185,6 +10349,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.cache_reg = vmx_cache_reg,
 	.get_rflags = vmx_get_rflags,
 	.set_rflags = vmx_set_rflags,
+	.fpu_activate = vmx_fpu_activate,
 	.fpu_deactivate = vmx_fpu_deactivate,
 
 	.tlb_flush = vmx_flush_tlb,
@@ -10254,6 +10419,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
 	.flush_log_dirty = vmx_flush_log_dirty,
 	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+
+	.pmu_ops = &intel_pmu_ops,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c73efcd03e29..ac165c2fb8e5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -28,6 +28,7 @@
 #include "x86.h"
 #include "cpuid.h"
 #include "assigned-dev.h"
+#include "pmu.h"
 
 #include <linux/clocksource.h>
 #include <linux/interrupt.h>
@@ -57,11 +58,9 @@
 #include <asm/debugreg.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
-#include <asm/mtrr.h>
 #include <asm/mce.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h> /* Ugh! */
-#include <asm/xcr.h>
+#include <linux/kernel_stat.h>
+#include <asm/fpu/internal.h> /* Ugh! */
 #include <asm/pvclock.h>
 #include <asm/div64.h>
 
@@ -99,6 +98,9 @@ module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
 unsigned int min_timer_period_us = 500;
 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
 
+static bool __read_mostly kvmclock_periodic_sync = true;
+module_param(kvmclock_periodic_sync, bool, S_IRUGO);
+
 bool kvm_has_tsc_control;
 EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
 u32  kvm_max_guest_tsc_khz;
@@ -475,7 +477,7 @@ EXPORT_SYMBOL_GPL(kvm_require_dr);
 
 /*
  * This function will be used to read from the physical memory of the currently
- * running guest. The difference to kvm_read_guest_page is that this function
+ * running guest. The difference to kvm_vcpu_read_guest_page is that this function
  * can read from guest physical or from the guest's guest physical memory.
  */
 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
@@ -493,7 +495,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 
 	real_gfn = gpa_to_gfn(real_gfn);
 
-	return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
+	return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
 }
 EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
 
@@ -572,8 +574,7 @@ out:
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
 	unsigned long old_cr0 = kvm_read_cr0(vcpu);
-	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
-				    X86_CR0_CD | X86_CR0_NW;
+	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
 
 	cr0 |= X86_CR0_ET;
 
@@ -619,6 +620,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 	if ((cr0 ^ old_cr0) & update_bits)
 		kvm_mmu_reset_context(vcpu);
+
+	if ((cr0 ^ old_cr0) & X86_CR0_CD)
+		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr0);
@@ -702,8 +707,9 @@ EXPORT_SYMBOL_GPL(kvm_set_xcr);
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	unsigned long old_cr4 = kvm_read_cr4(vcpu);
-	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
-				   X86_CR4_PAE | X86_CR4_SMEP;
+	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
+				   X86_CR4_SMEP | X86_CR4_SMAP;
+
 	if (cr4 & CR4_RESERVED_BITS)
 		return 1;
 
@@ -744,9 +750,6 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
 		kvm_mmu_reset_context(vcpu);
 
-	if ((cr4 ^ old_cr4) & X86_CR4_SMAP)
-		update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false);
-
 	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
 		kvm_update_cpuid(vcpu);
 
@@ -910,7 +913,7 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu)
 	u64 data;
 	int err;
 
-	err = kvm_pmu_read_pmc(vcpu, ecx, &data);
+	err = kvm_pmu_rdpmc(vcpu, ecx, &data);
 	if (err)
 		return err;
 	kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
@@ -925,17 +928,11 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
  *
  * This list is modified at module load time to reflect the
  * capabilities of the host cpu. This capabilities test skips MSRs that are
- * kvm-specific. Those are put in the beginning of the list.
+ * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
+ * may depend on host virtualization features rather than host cpu features.
  */
 
-#define KVM_SAVE_MSRS_BEGIN	12
 static u32 msrs_to_save[] = {
-	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
-	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
-	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
-	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
-	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
-	MSR_KVM_PV_EOI_EN,
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 	MSR_STAR,
 #ifdef CONFIG_X86_64
@@ -947,14 +944,24 @@ static u32 msrs_to_save[] = {
 
 static unsigned num_msrs_to_save;
 
-static const u32 emulated_msrs[] = {
+static u32 emulated_msrs[] = {
+	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+	MSR_KVM_PV_EOI_EN,
+
 	MSR_IA32_TSC_ADJUST,
 	MSR_IA32_TSCDEADLINE,
 	MSR_IA32_MISC_ENABLE,
 	MSR_IA32_MCG_STATUS,
 	MSR_IA32_MCG_CTL,
+	MSR_IA32_SMBASE,
 };
 
+static unsigned num_emulated_msrs;
+
 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
 	if (efer & efer_reserved_bits)
@@ -1048,6 +1055,21 @@ EXPORT_SYMBOL_GPL(kvm_set_msr);
 /*
  * Adapt set_msr() to msr_io()'s calling convention
  */
+static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+	struct msr_data msr;
+	int r;
+
+	msr.index = index;
+	msr.host_initiated = true;
+	r = kvm_get_msr(vcpu, &msr);
+	if (r)
+		return r;
+
+	*data = msr.data;
+	return 0;
+}
+
 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 {
 	struct msr_data msr;
@@ -1700,6 +1722,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->pvclock_set_guest_stopped_request = false;
 	}
 
+	pvclock_flags |= PVCLOCK_COUNTS_FROM_ZERO;
+
 	/* If the host uses TSC clocksource, then it is stable */
 	if (use_master_clock)
 		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
@@ -1770,127 +1794,14 @@ static void kvmclock_sync_fn(struct work_struct *work)
 					   kvmclock_sync_work);
 	struct kvm *kvm = container_of(ka, struct kvm, arch);
 
+	if (!kvmclock_periodic_sync)
+		return;
+
 	schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
 	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
 					KVMCLOCK_SYNC_PERIOD);
 }
 
-static bool msr_mtrr_valid(unsigned msr)
-{
-	switch (msr) {
-	case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
-	case MSR_MTRRfix64K_00000:
-	case MSR_MTRRfix16K_80000:
-	case MSR_MTRRfix16K_A0000:
-	case MSR_MTRRfix4K_C0000:
-	case MSR_MTRRfix4K_C8000:
-	case MSR_MTRRfix4K_D0000:
-	case MSR_MTRRfix4K_D8000:
-	case MSR_MTRRfix4K_E0000:
-	case MSR_MTRRfix4K_E8000:
-	case MSR_MTRRfix4K_F0000:
-	case MSR_MTRRfix4K_F8000:
-	case MSR_MTRRdefType:
-	case MSR_IA32_CR_PAT:
-		return true;
-	case 0x2f8:
-		return true;
-	}
-	return false;
-}
-
-static bool valid_pat_type(unsigned t)
-{
-	return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
-}
-
-static bool valid_mtrr_type(unsigned t)
-{
-	return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
-}
-
-bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
-	int i;
-	u64 mask;
-
-	if (!msr_mtrr_valid(msr))
-		return false;
-
-	if (msr == MSR_IA32_CR_PAT) {
-		for (i = 0; i < 8; i++)
-			if (!valid_pat_type((data >> (i * 8)) & 0xff))
-				return false;
-		return true;
-	} else if (msr == MSR_MTRRdefType) {
-		if (data & ~0xcff)
-			return false;
-		return valid_mtrr_type(data & 0xff);
-	} else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
-		for (i = 0; i < 8 ; i++)
-			if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
-				return false;
-		return true;
-	}
-
-	/* variable MTRRs */
-	WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
-
-	mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
-	if ((msr & 1) == 0) {
-		/* MTRR base */
-		if (!valid_mtrr_type(data & 0xff))
-			return false;
-		mask |= 0xf00;
-	} else
-		/* MTRR mask */
-		mask |= 0x7ff;
-	if (data & mask) {
-		kvm_inject_gp(vcpu, 0);
-		return false;
-	}
-
-	return true;
-}
-EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
-
-static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
-	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
-
-	if (!kvm_mtrr_valid(vcpu, msr, data))
-		return 1;
-
-	if (msr == MSR_MTRRdefType) {
-		vcpu->arch.mtrr_state.def_type = data;
-		vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
-	} else if (msr == MSR_MTRRfix64K_00000)
-		p[0] = data;
-	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
-		p[1 + msr - MSR_MTRRfix16K_80000] = data;
-	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
-		p[3 + msr - MSR_MTRRfix4K_C0000] = data;
-	else if (msr == MSR_IA32_CR_PAT)
-		vcpu->arch.pat = data;
-	else {	/* Variable MTRRs */
-		int idx, is_mtrr_mask;
-		u64 *pt;
-
-		idx = (msr - 0x200) / 2;
-		is_mtrr_mask = msr - 0x200 - 2 * idx;
-		if (!is_mtrr_mask)
-			pt =
-			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
-		else
-			pt =
-			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
-		*pt = data;
-	}
-
-	kvm_mmu_reset_context(vcpu);
-	return 0;
-}
-
 static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	u64 mcg_cap = vcpu->arch.mcg_cap;
@@ -1949,7 +1860,7 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
 		r = PTR_ERR(page);
 		goto out;
 	}
-	if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
+	if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
 		goto out_free;
 	r = 0;
 out_free:
@@ -2049,13 +1960,13 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 			break;
 		}
 		gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
-		addr = gfn_to_hva(vcpu->kvm, gfn);
+		addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
 		if (kvm_is_error_hva(addr))
 			return 1;
 		if (__clear_user((void __user *)addr, PAGE_SIZE))
 			return 1;
 		vcpu->arch.hv_vapic = data;
-		mark_page_dirty(vcpu->kvm, gfn);
+		kvm_vcpu_mark_page_dirty(vcpu, gfn);
 		if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
 			return 1;
 		break;
@@ -2182,7 +2093,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			    __func__, data);
 		break;
 	case 0x200 ... 0x2ff:
-		return set_msr_mtrr(vcpu, msr, data);
+		return kvm_mtrr_set_msr(vcpu, msr, data);
 	case MSR_IA32_APICBASE:
 		return kvm_set_apic_base(vcpu, msr_info);
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
@@ -2202,6 +2113,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->arch.ia32_misc_enable_msr = data;
 		break;
+	case MSR_IA32_SMBASE:
+		if (!msr_info->host_initiated)
+			return 1;
+		vcpu->arch.smbase = data;
+		break;
 	case MSR_KVM_WALL_CLOCK_NEW:
 	case MSR_KVM_WALL_CLOCK:
 		vcpu->kvm->arch.wall_clock = data;
@@ -2222,6 +2138,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 					&vcpu->requests);
 
 			ka->boot_vcpu_runs_old_kvmclock = tmp;
+
+			ka->kvmclock_offset = -get_kernel_ns();
 		}
 
 		vcpu->arch.time = data;
@@ -2283,37 +2201,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
 		return set_msr_mce(vcpu, msr, data);
 
-	/* Performance counters are not protected by a CPUID bit,
-	 * so we should check all of them in the generic path for the sake of
-	 * cross vendor migration.
-	 * Writing a zero into the event select MSRs disables them,
-	 * which we perfectly emulate ;-). Any other value should be at least
-	 * reported, some guests depend on them.
-	 */
-	case MSR_K7_EVNTSEL0:
-	case MSR_K7_EVNTSEL1:
-	case MSR_K7_EVNTSEL2:
-	case MSR_K7_EVNTSEL3:
-		if (data != 0)
-			vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
-				    "0x%x data 0x%llx\n", msr, data);
-		break;
-	/* at least RHEL 4 unconditionally writes to the perfctr registers,
-	 * so we ignore writes to make it happy.
-	 */
-	case MSR_K7_PERFCTR0:
-	case MSR_K7_PERFCTR1:
-	case MSR_K7_PERFCTR2:
-	case MSR_K7_PERFCTR3:
-		vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
-			    "0x%x data 0x%llx\n", msr, data);
-		break;
-	case MSR_P6_PERFCTR0:
-	case MSR_P6_PERFCTR1:
-		pr = true;
-	case MSR_P6_EVNTSEL0:
-	case MSR_P6_EVNTSEL1:
-		if (kvm_pmu_msr(vcpu, msr))
+	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+		pr = true; /* fall through */
+	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+		if (kvm_pmu_is_valid_msr(vcpu, msr))
 			return kvm_pmu_set_msr(vcpu, msr_info);
 
 		if (pr || data != 0)
@@ -2359,7 +2252,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	default:
 		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
 			return xen_hvm_config(vcpu, data);
-		if (kvm_pmu_msr(vcpu, msr))
+		if (kvm_pmu_is_valid_msr(vcpu, msr))
 			return kvm_pmu_set_msr(vcpu, msr_info);
 		if (!ignore_msrs) {
 			vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
@@ -2381,48 +2274,12 @@ EXPORT_SYMBOL_GPL(kvm_set_msr_common);
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
  */
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
-	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
+	return kvm_x86_ops->get_msr(vcpu, msr);
 }
 EXPORT_SYMBOL_GPL(kvm_get_msr);
 
-static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
-{
-	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
-
-	if (!msr_mtrr_valid(msr))
-		return 1;
-
-	if (msr == MSR_MTRRdefType)
-		*pdata = vcpu->arch.mtrr_state.def_type +
-			 (vcpu->arch.mtrr_state.enabled << 10);
-	else if (msr == MSR_MTRRfix64K_00000)
-		*pdata = p[0];
-	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
-		*pdata = p[1 + msr - MSR_MTRRfix16K_80000];
-	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
-		*pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
-	else if (msr == MSR_IA32_CR_PAT)
-		*pdata = vcpu->arch.pat;
-	else {	/* Variable MTRRs */
-		int idx, is_mtrr_mask;
-		u64 *pt;
-
-		idx = (msr - 0x200) / 2;
-		is_mtrr_mask = msr - 0x200 - 2 * idx;
-		if (!is_mtrr_mask)
-			pt =
-			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
-		else
-			pt =
-			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
-		*pdata = *pt;
-	}
-
-	return 0;
-}
-
 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 {
 	u64 data;
@@ -2520,11 +2377,11 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	return 0;
 }
 
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	u64 data;
 
-	switch (msr) {
+	switch (msr_info->index) {
 	case MSR_IA32_PLATFORM_ID:
 	case MSR_IA32_EBL_CR_POWERON:
 	case MSR_IA32_DEBUGCTLMSR:
@@ -2535,38 +2392,28 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_K8_SYSCFG:
 	case MSR_K7_HWCR:
 	case MSR_VM_HSAVE_PA:
-	case MSR_K7_EVNTSEL0:
-	case MSR_K7_EVNTSEL1:
-	case MSR_K7_EVNTSEL2:
-	case MSR_K7_EVNTSEL3:
-	case MSR_K7_PERFCTR0:
-	case MSR_K7_PERFCTR1:
-	case MSR_K7_PERFCTR2:
-	case MSR_K7_PERFCTR3:
 	case MSR_K8_INT_PENDING_MSG:
 	case MSR_AMD64_NB_CFG:
 	case MSR_FAM10H_MMIO_CONF_BASE:
 	case MSR_AMD64_BU_CFG2:
-		data = 0;
+		msr_info->data = 0;
 		break;
-	case MSR_P6_PERFCTR0:
-	case MSR_P6_PERFCTR1:
-	case MSR_P6_EVNTSEL0:
-	case MSR_P6_EVNTSEL1:
-		if (kvm_pmu_msr(vcpu, msr))
-			return kvm_pmu_get_msr(vcpu, msr, pdata);
-		data = 0;
+	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+			return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
+		msr_info->data = 0;
 		break;
 	case MSR_IA32_UCODE_REV:
-		data = 0x100000000ULL;
+		msr_info->data = 0x100000000ULL;
 		break;
 	case MSR_MTRRcap:
-		data = 0x500 | KVM_NR_VAR_MTRR;
-		break;
 	case 0x200 ... 0x2ff:
-		return get_msr_mtrr(vcpu, msr, pdata);
+		return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
 	case 0xcd: /* fsb frequency */
-		data = 3;
+		msr_info->data = 3;
 		break;
 		/*
 		 * MSR_EBC_FREQUENCY_ID
@@ -2580,48 +2427,53 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		 * multiplying by zero otherwise.
 		 */
 	case MSR_EBC_FREQUENCY_ID:
-		data = 1 << 24;
+		msr_info->data = 1 << 24;
 		break;
 	case MSR_IA32_APICBASE:
-		data = kvm_get_apic_base(vcpu);
+		msr_info->data = kvm_get_apic_base(vcpu);
 		break;
 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
-		return kvm_x2apic_msr_read(vcpu, msr, pdata);
+		return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
 		break;
 	case MSR_IA32_TSCDEADLINE:
-		data = kvm_get_lapic_tscdeadline_msr(vcpu);
+		msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
 		break;
 	case MSR_IA32_TSC_ADJUST:
-		data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
+		msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
 		break;
 	case MSR_IA32_MISC_ENABLE:
-		data = vcpu->arch.ia32_misc_enable_msr;
+		msr_info->data = vcpu->arch.ia32_misc_enable_msr;
+		break;
+	case MSR_IA32_SMBASE:
+		if (!msr_info->host_initiated)
+			return 1;
+		msr_info->data = vcpu->arch.smbase;
 		break;
 	case MSR_IA32_PERF_STATUS:
 		/* TSC increment by tick */
-		data = 1000ULL;
+		msr_info->data = 1000ULL;
 		/* CPU multiplier */
 		data |= (((uint64_t)4ULL) << 40);
 		break;
 	case MSR_EFER:
-		data = vcpu->arch.efer;
+		msr_info->data = vcpu->arch.efer;
 		break;
 	case MSR_KVM_WALL_CLOCK:
 	case MSR_KVM_WALL_CLOCK_NEW:
-		data = vcpu->kvm->arch.wall_clock;
+		msr_info->data = vcpu->kvm->arch.wall_clock;
 		break;
 	case MSR_KVM_SYSTEM_TIME:
 	case MSR_KVM_SYSTEM_TIME_NEW:
-		data = vcpu->arch.time;
+		msr_info->data = vcpu->arch.time;
 		break;
 	case MSR_KVM_ASYNC_PF_EN:
-		data = vcpu->arch.apf.msr_val;
+		msr_info->data = vcpu->arch.apf.msr_val;
 		break;
 	case MSR_KVM_STEAL_TIME:
-		data = vcpu->arch.st.msr_val;
+		msr_info->data = vcpu->arch.st.msr_val;
 		break;
 	case MSR_KVM_PV_EOI_EN:
-		data = vcpu->arch.pv_eoi.msr_val;
+		msr_info->data = vcpu->arch.pv_eoi.msr_val;
 		break;
 	case MSR_IA32_P5_MC_ADDR:
 	case MSR_IA32_P5_MC_TYPE:
@@ -2629,7 +2481,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
-		return get_msr_mce(vcpu, msr, pdata);
+		return get_msr_mce(vcpu, msr_info->index, &msr_info->data);
 	case MSR_K7_CLK_CTL:
 		/*
 		 * Provide expected ramp-up count for K7. All other
@@ -2640,17 +2492,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		 * type 6, model 8 and higher from exploding due to
 		 * the rdmsr failing.
 		 */
-		data = 0x20000000;
+		msr_info->data = 0x20000000;
 		break;
 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
-		if (kvm_hv_msr_partition_wide(msr)) {
+		if (kvm_hv_msr_partition_wide(msr_info->index)) {
 			int r;
 			mutex_lock(&vcpu->kvm->lock);
-			r = get_msr_hyperv_pw(vcpu, msr, pdata);
+			r = get_msr_hyperv_pw(vcpu, msr_info->index, &msr_info->data);
 			mutex_unlock(&vcpu->kvm->lock);
 			return r;
 		} else
-			return get_msr_hyperv(vcpu, msr, pdata);
+			return get_msr_hyperv(vcpu, msr_info->index, &msr_info->data);
 		break;
 	case MSR_IA32_BBL_CR_CTL3:
 		/* This legacy MSR exists but isn't fully documented in current
@@ -2663,31 +2515,30 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		 * L2 cache control register 3: 64GB range, 256KB size,
 		 * enabled, latency 0x1, configured
 		 */
-		data = 0xbe702111;
+		msr_info->data = 0xbe702111;
 		break;
 	case MSR_AMD64_OSVW_ID_LENGTH:
 		if (!guest_cpuid_has_osvw(vcpu))
 			return 1;
-		data = vcpu->arch.osvw.length;
+		msr_info->data = vcpu->arch.osvw.length;
 		break;
 	case MSR_AMD64_OSVW_STATUS:
 		if (!guest_cpuid_has_osvw(vcpu))
 			return 1;
-		data = vcpu->arch.osvw.status;
+		msr_info->data = vcpu->arch.osvw.status;
 		break;
 	default:
-		if (kvm_pmu_msr(vcpu, msr))
-			return kvm_pmu_get_msr(vcpu, msr, pdata);
+		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+			return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
 		if (!ignore_msrs) {
-			vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+			vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index);
 			return 1;
 		} else {
-			vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
-			data = 0;
+			vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index);
+			msr_info->data = 0;
 		}
 		break;
 	}
-	*pdata = data;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
@@ -2800,12 +2651,25 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_HYPERV_TIME:
 	case KVM_CAP_IOAPIC_POLARITY_IGNORED:
 	case KVM_CAP_TSC_DEADLINE_TIMER:
+	case KVM_CAP_ENABLE_CAP_VM:
+	case KVM_CAP_DISABLE_QUIRKS:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 	case KVM_CAP_ASSIGN_DEV_IRQ:
 	case KVM_CAP_PCI_2_3:
 #endif
 		r = 1;
 		break;
+	case KVM_CAP_X86_SMM:
+		/* SMBASE is usually relocated above 1M on modern chipsets,
+		 * and SMM handlers might indeed rely on 4G segment limits,
+		 * so do not report SMM to be available if real mode is
+		 * emulated via vm86 mode.  Still, do not go to great lengths
+		 * to avoid userspace's usage of the feature, because it is a
+		 * fringe case that is not enabled except via specific settings
+		 * of the module parameters.
+		 */
+		r = kvm_x86_ops->cpu_has_high_real_mode_segbase();
+		break;
 	case KVM_CAP_COALESCED_MMIO:
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
@@ -2862,7 +2726,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
 		if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
 			goto out;
 		n = msr_list.nmsrs;
-		msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
+		msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
 		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
 			goto out;
 		r = -E2BIG;
@@ -2874,7 +2738,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
 			goto out;
 		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
 				 &emulated_msrs,
-				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
+				 num_emulated_msrs * sizeof(u32)))
 			goto out;
 		r = 0;
 		break;
@@ -3018,6 +2882,13 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
+{
+	kvm_make_request(KVM_REQ_SMI, vcpu);
+
+	return 0;
+}
+
 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
 					   struct kvm_tpr_access_ctl *tac)
 {
@@ -3123,8 +2994,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 
 	events->sipi_vector = 0; /* never valid when reporting to user space */
 
+	events->smi.smm = is_smm(vcpu);
+	events->smi.pending = vcpu->arch.smi_pending;
+	events->smi.smm_inside_nmi =
+		!!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
+	events->smi.latched_init = kvm_lapic_latched_init(vcpu);
+
 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
-			 | KVM_VCPUEVENT_VALID_SHADOW);
+			 | KVM_VCPUEVENT_VALID_SHADOW
+			 | KVM_VCPUEVENT_VALID_SMM);
 	memset(&events->reserved, 0, sizeof(events->reserved));
 }
 
@@ -3133,7 +3011,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 {
 	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
 			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
-			      | KVM_VCPUEVENT_VALID_SHADOW))
+			      | KVM_VCPUEVENT_VALID_SHADOW
+			      | KVM_VCPUEVENT_VALID_SMM))
 		return -EINVAL;
 
 	process_nmi(vcpu);
@@ -3158,6 +3037,24 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	    kvm_vcpu_has_lapic(vcpu))
 		vcpu->arch.apic->sipi_vector = events->sipi_vector;
 
+	if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+		if (events->smi.smm)
+			vcpu->arch.hflags |= HF_SMM_MASK;
+		else
+			vcpu->arch.hflags &= ~HF_SMM_MASK;
+		vcpu->arch.smi_pending = events->smi.pending;
+		if (events->smi.smm_inside_nmi)
+			vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+		else
+			vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
+		if (kvm_vcpu_has_lapic(vcpu)) {
+			if (events->smi.latched_init)
+				set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+			else
+				clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+		}
+	}
+
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 	return 0;
@@ -3196,8 +3093,8 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 
 static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
 {
-	struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave;
-	u64 xstate_bv = xsave->xsave_hdr.xstate_bv;
+	struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
+	u64 xstate_bv = xsave->header.xfeatures;
 	u64 valid;
 
 	/*
@@ -3232,7 +3129,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
 
 static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
 {
-	struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave;
+	struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
 	u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
 	u64 valid;
 
@@ -3243,9 +3140,9 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
 	memcpy(xsave, src, XSAVE_HDR_OFFSET);
 
 	/* Set XSTATE_BV and possibly XCOMP_BV.  */
-	xsave->xsave_hdr.xstate_bv = xstate_bv;
+	xsave->header.xfeatures = xstate_bv;
 	if (cpu_has_xsaves)
-		xsave->xsave_hdr.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
+		xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
 
 	/*
 	 * Copy each region from the non-compacted offset to the
@@ -3277,8 +3174,8 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
 		fill_xsave((u8 *) guest_xsave->region, vcpu);
 	} else {
 		memcpy(guest_xsave->region,
-			&vcpu->arch.guest_fpu.state->fxsave,
-			sizeof(struct i387_fxsave_struct));
+			&vcpu->arch.guest_fpu.state.fxsave,
+			sizeof(struct fxregs_state));
 		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
 			XSTATE_FPSSE;
 	}
@@ -3302,8 +3199,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
 	} else {
 		if (xstate_bv & ~XSTATE_FPSSE)
 			return -EINVAL;
-		memcpy(&vcpu->arch.guest_fpu.state->fxsave,
-			guest_xsave->region, sizeof(struct i387_fxsave_struct));
+		memcpy(&vcpu->arch.guest_fpu.state.fxsave,
+			guest_xsave->region, sizeof(struct fxregs_state));
 	}
 	return 0;
 }
@@ -3417,6 +3314,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_nmi(vcpu);
 		break;
 	}
+	case KVM_SMI: {
+		r = kvm_vcpu_ioctl_smi(vcpu);
+		break;
+	}
 	case KVM_SET_CPUID: {
 		struct kvm_cpuid __user *cpuid_arg = argp;
 		struct kvm_cpuid cpuid;
@@ -3456,7 +3357,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_GET_MSRS:
-		r = msr_io(vcpu, argp, kvm_get_msr, 1);
+		r = msr_io(vcpu, argp, do_get_msr, 1);
 		break;
 	case KVM_SET_MSRS:
 		r = msr_io(vcpu, argp, do_set_msr, 0);
@@ -3847,6 +3748,26 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
 	return 0;
 }
 
+static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+				   struct kvm_enable_cap *cap)
+{
+	int r;
+
+	if (cap->flags)
+		return -EINVAL;
+
+	switch (cap->cap) {
+	case KVM_CAP_DISABLE_QUIRKS:
+		kvm->arch.disabled_quirks = cap->args[0];
+		r = 0;
+		break;
+	default:
+		r = -EINVAL;
+		break;
+	}
+	return r;
+}
+
 long kvm_arch_vm_ioctl(struct file *filp,
 		       unsigned int ioctl, unsigned long arg)
 {
@@ -4099,7 +4020,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_ENABLE_CAP: {
+		struct kvm_enable_cap cap;
 
+		r = -EFAULT;
+		if (copy_from_user(&cap, argp, sizeof(cap)))
+			goto out;
+		r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+		break;
+	}
 	default:
 		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
 	}
@@ -4112,8 +4041,7 @@ static void kvm_init_msr_list(void)
 	u32 dummy[2];
 	unsigned i, j;
 
-	/* skip the first msrs in the list. KVM-specific */
-	for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
+	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
 		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
 			continue;
 
@@ -4138,6 +4066,22 @@ static void kvm_init_msr_list(void)
 		j++;
 	}
 	num_msrs_to_save = j;
+
+	for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
+		switch (emulated_msrs[i]) {
+		case MSR_IA32_SMBASE:
+			if (!kvm_x86_ops->cpu_has_high_real_mode_segbase())
+				continue;
+			break;
+		default:
+			break;
+		}
+
+		if (j < i)
+			emulated_msrs[j] = emulated_msrs[i];
+		j++;
+	}
+	num_emulated_msrs = j;
 }
 
 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
@@ -4255,8 +4199,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 
 		if (gpa == UNMAPPED_GVA)
 			return X86EMUL_PROPAGATE_FAULT;
-		ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data,
-					  offset, toread);
+		ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
+					       offset, toread);
 		if (ret < 0) {
 			r = X86EMUL_IO_NEEDED;
 			goto out;
@@ -4289,8 +4233,8 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
 	offset = addr & (PAGE_SIZE-1);
 	if (WARN_ON(offset + bytes > PAGE_SIZE))
 		bytes = (unsigned)PAGE_SIZE - offset;
-	ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val,
-				  offset, bytes);
+	ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
+				       offset, bytes);
 	if (unlikely(ret < 0))
 		return X86EMUL_IO_NEEDED;
 
@@ -4336,7 +4280,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 
 		if (gpa == UNMAPPED_GVA)
 			return X86EMUL_PROPAGATE_FAULT;
-		ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
+		ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
 		if (ret < 0) {
 			r = X86EMUL_IO_NEEDED;
 			goto out;
@@ -4389,7 +4333,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 {
 	int ret;
 
-	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+	ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
 	if (ret < 0)
 		return 0;
 	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
@@ -4423,7 +4367,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
 static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
 			void *val, int bytes)
 {
-	return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
+	return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
 }
 
 static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -4621,7 +4565,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
 	if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
 		goto emul_write;
 
-	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
 	if (is_error_page(page))
 		goto emul_write;
 
@@ -4649,7 +4593,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
 	if (!exchanged)
 		return X86EMUL_CMPXCHG_FAILED;
 
-	mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
+	kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
 	kvm_mmu_pte_write(vcpu, gpa, new, bytes);
 
 	return X86EMUL_CONTINUE;
@@ -4948,7 +4892,17 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
 			    u32 msr_index, u64 *pdata)
 {
-	return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
+	struct msr_data msr;
+	int r;
+
+	msr.index = msr_index;
+	msr.host_initiated = false;
+	r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
+	if (r)
+		return r;
+
+	*pdata = msr.data;
+	return 0;
 }
 
 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
@@ -4962,16 +4916,30 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
 	return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
 }
 
+static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
+{
+	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+	return vcpu->arch.smbase;
+}
+
+static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+	vcpu->arch.smbase = smbase;
+}
+
 static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
 			      u32 pmc)
 {
-	return kvm_pmu_check_pmc(emul_to_vcpu(ctxt), pmc);
+	return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
 }
 
 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
 			     u32 pmc, u64 *pdata)
 {
-	return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata);
+	return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
 }
 
 static void emulator_halt(struct x86_emulate_ctxt *ctxt)
@@ -5047,6 +5015,8 @@ static const struct x86_emulate_ops emulate_ops = {
 	.cpl                 = emulator_get_cpl,
 	.get_dr              = emulator_get_dr,
 	.set_dr              = emulator_set_dr,
+	.get_smbase          = emulator_get_smbase,
+	.set_smbase          = emulator_set_smbase,
 	.set_msr             = emulator_set_msr,
 	.get_msr             = emulator_get_msr,
 	.check_pmc	     = emulator_check_pmc,
@@ -5108,7 +5078,10 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 		     (cs_l && is_long_mode(vcpu))	? X86EMUL_MODE_PROT64 :
 		     cs_db				? X86EMUL_MODE_PROT32 :
 							  X86EMUL_MODE_PROT16;
-	ctxt->guest_mode = is_guest_mode(vcpu);
+	BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
+	BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
+	BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
+	ctxt->emul_flags = vcpu->arch.hflags;
 
 	init_decode_cache(ctxt);
 	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
@@ -5277,6 +5250,34 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
 static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
 static int complete_emulated_pio(struct kvm_vcpu *vcpu);
 
+static void kvm_smm_changed(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
+		/* This is a good place to trace that we are exiting SMM.  */
+		trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
+
+		if (unlikely(vcpu->arch.smi_pending)) {
+			kvm_make_request(KVM_REQ_SMI, vcpu);
+			vcpu->arch.smi_pending = 0;
+		} else {
+			/* Process a latched INIT, if any.  */
+			kvm_make_request(KVM_REQ_EVENT, vcpu);
+		}
+	}
+
+	kvm_mmu_reset_context(vcpu);
+}
+
+static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
+{
+	unsigned changed = vcpu->arch.hflags ^ emul_flags;
+
+	vcpu->arch.hflags = emul_flags;
+
+	if (changed & HF_SMM_MASK)
+		kvm_smm_changed(vcpu);
+}
+
 static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
 				unsigned long *db)
 {
@@ -5476,6 +5477,8 @@ restart:
 		unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
 		toggle_interruptibility(vcpu, ctxt->interruptibility);
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+		if (vcpu->arch.hflags != ctxt->emul_flags)
+			kvm_set_hflags(vcpu, ctxt->emul_flags);
 		kvm_rip_write(vcpu, ctxt->eip);
 		if (r == EMULATE_DONE)
 			kvm_vcpu_check_singlestep(vcpu, rflags, &r);
@@ -5954,6 +5957,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
 	lapic_irq.shorthand = 0;
 	lapic_irq.dest_mode = 0;
 	lapic_irq.dest_id = apicid;
+	lapic_irq.msi_redir_hint = false;
 
 	lapic_irq.delivery_mode = APIC_DM_REMRD;
 	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
@@ -6041,6 +6045,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
 	struct kvm_run *kvm_run = vcpu->run;
 
 	kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+	kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
 	kvm_run->cr8 = kvm_get_cr8(vcpu);
 	kvm_run->apic_base = kvm_get_apic_base(vcpu);
 	if (irqchip_in_kernel(vcpu->kvm))
@@ -6164,6 +6169,233 @@ static void process_nmi(struct kvm_vcpu *vcpu)
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 
+#define put_smstate(type, buf, offset, val)			  \
+	*(type *)((buf) + (offset) - 0x7e00) = val
+
+static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
+{
+	u32 flags = 0;
+	flags |= seg->g       << 23;
+	flags |= seg->db      << 22;
+	flags |= seg->l       << 21;
+	flags |= seg->avl     << 20;
+	flags |= seg->present << 15;
+	flags |= seg->dpl     << 13;
+	flags |= seg->s       << 12;
+	flags |= seg->type    << 8;
+	return flags;
+}
+
+static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
+{
+	struct kvm_segment seg;
+	int offset;
+
+	kvm_get_segment(vcpu, &seg, n);
+	put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
+
+	if (n < 3)
+		offset = 0x7f84 + n * 12;
+	else
+		offset = 0x7f2c + (n - 3) * 12;
+
+	put_smstate(u32, buf, offset + 8, seg.base);
+	put_smstate(u32, buf, offset + 4, seg.limit);
+	put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
+}
+
+static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
+{
+	struct kvm_segment seg;
+	int offset;
+	u16 flags;
+
+	kvm_get_segment(vcpu, &seg, n);
+	offset = 0x7e00 + n * 16;
+
+	flags = process_smi_get_segment_flags(&seg) >> 8;
+	put_smstate(u16, buf, offset, seg.selector);
+	put_smstate(u16, buf, offset + 2, flags);
+	put_smstate(u32, buf, offset + 4, seg.limit);
+	put_smstate(u64, buf, offset + 8, seg.base);
+}
+
+static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
+{
+	struct desc_ptr dt;
+	struct kvm_segment seg;
+	unsigned long val;
+	int i;
+
+	put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
+	put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
+	put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
+	put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
+
+	for (i = 0; i < 8; i++)
+		put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
+
+	kvm_get_dr(vcpu, 6, &val);
+	put_smstate(u32, buf, 0x7fcc, (u32)val);
+	kvm_get_dr(vcpu, 7, &val);
+	put_smstate(u32, buf, 0x7fc8, (u32)val);
+
+	kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
+	put_smstate(u32, buf, 0x7fc4, seg.selector);
+	put_smstate(u32, buf, 0x7f64, seg.base);
+	put_smstate(u32, buf, 0x7f60, seg.limit);
+	put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg));
+
+	kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
+	put_smstate(u32, buf, 0x7fc0, seg.selector);
+	put_smstate(u32, buf, 0x7f80, seg.base);
+	put_smstate(u32, buf, 0x7f7c, seg.limit);
+	put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg));
+
+	kvm_x86_ops->get_gdt(vcpu, &dt);
+	put_smstate(u32, buf, 0x7f74, dt.address);
+	put_smstate(u32, buf, 0x7f70, dt.size);
+
+	kvm_x86_ops->get_idt(vcpu, &dt);
+	put_smstate(u32, buf, 0x7f58, dt.address);
+	put_smstate(u32, buf, 0x7f54, dt.size);
+
+	for (i = 0; i < 6; i++)
+		process_smi_save_seg_32(vcpu, buf, i);
+
+	put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
+
+	/* revision id */
+	put_smstate(u32, buf, 0x7efc, 0x00020000);
+	put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
+}
+
+static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+{
+#ifdef CONFIG_X86_64
+	struct desc_ptr dt;
+	struct kvm_segment seg;
+	unsigned long val;
+	int i;
+
+	for (i = 0; i < 16; i++)
+		put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
+
+	put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
+	put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
+
+	kvm_get_dr(vcpu, 6, &val);
+	put_smstate(u64, buf, 0x7f68, val);
+	kvm_get_dr(vcpu, 7, &val);
+	put_smstate(u64, buf, 0x7f60, val);
+
+	put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
+	put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
+	put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
+
+	put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
+
+	/* revision id */
+	put_smstate(u32, buf, 0x7efc, 0x00020064);
+
+	put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
+
+	kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
+	put_smstate(u16, buf, 0x7e90, seg.selector);
+	put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8);
+	put_smstate(u32, buf, 0x7e94, seg.limit);
+	put_smstate(u64, buf, 0x7e98, seg.base);
+
+	kvm_x86_ops->get_idt(vcpu, &dt);
+	put_smstate(u32, buf, 0x7e84, dt.size);
+	put_smstate(u64, buf, 0x7e88, dt.address);
+
+	kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
+	put_smstate(u16, buf, 0x7e70, seg.selector);
+	put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8);
+	put_smstate(u32, buf, 0x7e74, seg.limit);
+	put_smstate(u64, buf, 0x7e78, seg.base);
+
+	kvm_x86_ops->get_gdt(vcpu, &dt);
+	put_smstate(u32, buf, 0x7e64, dt.size);
+	put_smstate(u64, buf, 0x7e68, dt.address);
+
+	for (i = 0; i < 6; i++)
+		process_smi_save_seg_64(vcpu, buf, i);
+#else
+	WARN_ON_ONCE(1);
+#endif
+}
+
+static void process_smi(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs, ds;
+	char buf[512];
+	u32 cr0;
+
+	if (is_smm(vcpu)) {
+		vcpu->arch.smi_pending = true;
+		return;
+	}
+
+	trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
+	vcpu->arch.hflags |= HF_SMM_MASK;
+	memset(buf, 0, 512);
+	if (guest_cpuid_has_longmode(vcpu))
+		process_smi_save_state_64(vcpu, buf);
+	else
+		process_smi_save_state_32(vcpu, buf);
+
+	kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
+
+	if (kvm_x86_ops->get_nmi_mask(vcpu))
+		vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+	else
+		kvm_x86_ops->set_nmi_mask(vcpu, true);
+
+	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+	kvm_rip_write(vcpu, 0x8000);
+
+	cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
+	kvm_x86_ops->set_cr0(vcpu, cr0);
+	vcpu->arch.cr0 = cr0;
+
+	kvm_x86_ops->set_cr4(vcpu, 0);
+
+	__kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+
+	cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
+	cs.base = vcpu->arch.smbase;
+
+	ds.selector = 0;
+	ds.base = 0;
+
+	cs.limit    = ds.limit = 0xffffffff;
+	cs.type     = ds.type = 0x3;
+	cs.dpl      = ds.dpl = 0;
+	cs.db       = ds.db = 0;
+	cs.s        = ds.s = 1;
+	cs.l        = ds.l = 0;
+	cs.g        = ds.g = 1;
+	cs.avl      = ds.avl = 0;
+	cs.present  = ds.present = 1;
+	cs.unusable = ds.unusable = 0;
+	cs.padding  = ds.padding = 0;
+
+	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+	kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
+	kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
+	kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
+	kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
+	kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+
+	if (guest_cpuid_has_longmode(vcpu))
+		kvm_x86_ops->set_efer(vcpu, 0);
+
+	kvm_update_cpuid(vcpu);
+	kvm_mmu_reset_context(vcpu);
+}
+
 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 {
 	u64 eoi_exit_bitmap[4];
@@ -6197,6 +6429,8 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 		return;
 
 	page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+	if (is_error_page(page))
+		return;
 	kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
 
 	/*
@@ -6270,12 +6504,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		}
 		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
 			record_steal_time(vcpu);
+		if (kvm_check_request(KVM_REQ_SMI, vcpu))
+			process_smi(vcpu);
 		if (kvm_check_request(KVM_REQ_NMI, vcpu))
 			process_nmi(vcpu);
 		if (kvm_check_request(KVM_REQ_PMU, vcpu))
-			kvm_handle_pmu_event(vcpu);
+			kvm_pmu_handle_event(vcpu);
 		if (kvm_check_request(KVM_REQ_PMI, vcpu))
-			kvm_deliver_pmi(vcpu);
+			kvm_pmu_deliver_pmi(vcpu);
 		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
 			vcpu_scan_ioapic(vcpu);
 		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
@@ -6347,7 +6583,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (req_immediate_exit)
 		smp_send_reschedule(vcpu->cpu);
 
-	kvm_guest_enter();
+	__kvm_guest_enter();
 
 	if (unlikely(vcpu->arch.switch_db_regs)) {
 		set_debugreg(0, 7);
@@ -6597,11 +6833,11 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
+	struct fpu *fpu = &current->thread.fpu;
 	int r;
 	sigset_t sigsaved;
 
-	if (!tsk_used_math(current) && init_fpu(current))
-		return -ENOMEM;
+	fpu__activate_curr(fpu);
 
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
@@ -6971,8 +7207,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-	struct i387_fxsave_struct *fxsave =
-			&vcpu->arch.guest_fpu.state->fxsave;
+	struct fxregs_state *fxsave =
+			&vcpu->arch.guest_fpu.state.fxsave;
 
 	memcpy(fpu->fpr, fxsave->st_space, 128);
 	fpu->fcw = fxsave->cwd;
@@ -6988,8 +7224,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-	struct i387_fxsave_struct *fxsave =
-			&vcpu->arch.guest_fpu.state->fxsave;
+	struct fxregs_state *fxsave =
+			&vcpu->arch.guest_fpu.state.fxsave;
 
 	memcpy(fxsave->st_space, fpu->fpr, 128);
 	fxsave->cwd = fpu->fcw;
@@ -7003,17 +7239,11 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	return 0;
 }
 
-int fx_init(struct kvm_vcpu *vcpu)
+static void fx_init(struct kvm_vcpu *vcpu)
 {
-	int err;
-
-	err = fpu_alloc(&vcpu->arch.guest_fpu);
-	if (err)
-		return err;
-
-	fpu_finit(&vcpu->arch.guest_fpu);
+	fpstate_init(&vcpu->arch.guest_fpu.state);
 	if (cpu_has_xsaves)
-		vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv =
+		vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
 			host_xcr0 | XSTATE_COMPACTION_ENABLED;
 
 	/*
@@ -7022,14 +7252,6 @@ int fx_init(struct kvm_vcpu *vcpu)
 	vcpu->arch.xcr0 = XSTATE_FP;
 
 	vcpu->arch.cr0 |= X86_CR0_ET;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(fx_init);
-
-static void fx_free(struct kvm_vcpu *vcpu)
-{
-	fpu_free(&vcpu->arch.guest_fpu);
 }
 
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
@@ -7045,7 +7267,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 	kvm_put_guest_xcr0(vcpu);
 	vcpu->guest_fpu_loaded = 1;
 	__kernel_fpu_begin();
-	fpu_restore_checking(&vcpu->arch.guest_fpu);
+	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
 	trace_kvm_fpu(1);
 }
 
@@ -7053,14 +7275,25 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
 	kvm_put_guest_xcr0(vcpu);
 
-	if (!vcpu->guest_fpu_loaded)
+	if (!vcpu->guest_fpu_loaded) {
+		vcpu->fpu_counter = 0;
 		return;
+	}
 
 	vcpu->guest_fpu_loaded = 0;
-	fpu_save_init(&vcpu->arch.guest_fpu);
+	copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
 	__kernel_fpu_end();
 	++vcpu->stat.fpu_reload;
-	kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+	/*
+	 * If using eager FPU mode, or if the guest is a frequent user
+	 * of the FPU, just leave the FPU active for next time.
+	 * Every 255 times fpu_counter rolls over to 0; a guest that uses
+	 * the FPU in bursts will revert to loading it on demand.
+	 */
+	if (!vcpu->arch.eager_fpu) {
+		if (++vcpu->fpu_counter < 5)
+			kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+	}
 	trace_kvm_fpu(0);
 }
 
@@ -7069,32 +7302,40 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 	kvmclock_reset(vcpu);
 
 	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
-	fx_free(vcpu);
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 						unsigned int id)
 {
+	struct kvm_vcpu *vcpu;
+
 	if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
 		printk_once(KERN_WARNING
 		"kvm: SMP vm created on host with unstable TSC; "
 		"guest TSC will not be reliable\n");
-	return kvm_x86_ops->vcpu_create(kvm, id);
+
+	vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+
+	/*
+	 * Activate fpu unconditionally in case the guest needs eager FPU.  It will be
+	 * deactivated soon if it doesn't.
+	 */
+	kvm_x86_ops->fpu_activate(vcpu);
+	return vcpu;
 }
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	int r;
 
-	vcpu->arch.mtrr_state.have_fixed = 1;
+	kvm_vcpu_mtrr_init(vcpu);
 	r = vcpu_load(vcpu);
 	if (r)
 		return r;
-	kvm_vcpu_reset(vcpu);
+	kvm_vcpu_reset(vcpu, false);
 	kvm_mmu_setup(vcpu);
 	vcpu_put(vcpu);
-
 	return r;
 }
 
@@ -7111,6 +7352,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 	kvm_write_tsc(vcpu, &msr);
 	vcpu_put(vcpu);
 
+	if (!kvmclock_periodic_sync)
+		return;
+
 	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
 					KVMCLOCK_SYNC_PERIOD);
 }
@@ -7125,12 +7369,13 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_mmu_unload(vcpu);
 	vcpu_put(vcpu);
 
-	fx_free(vcpu);
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
+	vcpu->arch.hflags = 0;
+
 	atomic_set(&vcpu->arch.nmi_queued, 0);
 	vcpu->arch.nmi_pending = 0;
 	vcpu->arch.nmi_injected = false;
@@ -7156,13 +7401,16 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 	kvm_async_pf_hash_reset(vcpu);
 	vcpu->arch.apf.halted = false;
 
-	kvm_pmu_reset(vcpu);
+	if (!init_event) {
+		kvm_pmu_reset(vcpu);
+		vcpu->arch.smbase = 0x30000;
+	}
 
 	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
 	vcpu->arch.regs_avail = ~0;
 	vcpu->arch.regs_dirty = ~0;
 
-	kvm_x86_ops->vcpu_reset(vcpu);
+	kvm_x86_ops->vcpu_reset(vcpu, init_event);
 }
 
 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -7351,9 +7599,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 		goto fail_free_mce_banks;
 	}
 
-	r = fx_init(vcpu);
-	if (r)
-		goto fail_free_wbinvd_dirty_mask;
+	fx_init(vcpu);
 
 	vcpu->arch.ia32_tsc_adjust_msr = 0x0;
 	vcpu->arch.pv_time_enabled = false;
@@ -7363,12 +7609,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
 
+	vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
 	kvm_async_pf_hash_reset(vcpu);
 	kvm_pmu_init(vcpu);
 
 	return 0;
-fail_free_wbinvd_dirty_mask:
-	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+
 fail_free_mce_banks:
 	kfree(vcpu->arch.mce_banks);
 fail_free_lapic:
@@ -7470,6 +7717,40 @@ void kvm_arch_sync_events(struct kvm *kvm)
 	kvm_free_pit(kvm);
 }
 
+int __x86_set_memory_region(struct kvm *kvm,
+			    const struct kvm_userspace_memory_region *mem)
+{
+	int i, r;
+
+	/* Called with kvm->slots_lock held.  */
+	BUG_ON(mem->slot >= KVM_MEM_SLOTS_NUM);
+
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		struct kvm_userspace_memory_region m = *mem;
+
+		m.slot |= i << 16;
+		r = __kvm_set_memory_region(kvm, &m);
+		if (r < 0)
+			return r;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__x86_set_memory_region);
+
+int x86_set_memory_region(struct kvm *kvm,
+			  const struct kvm_userspace_memory_region *mem)
+{
+	int r;
+
+	mutex_lock(&kvm->slots_lock);
+	r = __x86_set_memory_region(kvm, mem);
+	mutex_unlock(&kvm->slots_lock);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(x86_set_memory_region);
+
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	if (current->mm == kvm->mm) {
@@ -7481,13 +7762,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 		struct kvm_userspace_memory_region mem;
 		memset(&mem, 0, sizeof(mem));
 		mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
-		kvm_set_memory_region(kvm, &mem);
+		x86_set_memory_region(kvm, &mem);
 
 		mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
-		kvm_set_memory_region(kvm, &mem);
+		x86_set_memory_region(kvm, &mem);
 
 		mem.slot = TSS_PRIVATE_MEMSLOT;
-		kvm_set_memory_region(kvm, &mem);
+		x86_set_memory_region(kvm, &mem);
 	}
 	kvm_iommu_unmap_guest(kvm);
 	kfree(kvm->arch.vpic);
@@ -7576,18 +7857,18 @@ out_free:
 	return -ENOMEM;
 }
 
-void kvm_arch_memslots_updated(struct kvm *kvm)
+void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
 {
 	/*
 	 * memslots->generation has been incremented.
 	 * mmio generation may have reached its maximum value.
 	 */
-	kvm_mmu_invalidate_mmio_sptes(kvm);
+	kvm_mmu_invalidate_mmio_sptes(kvm, slots);
 }
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
-				struct kvm_userspace_memory_region *mem,
+				const struct kvm_userspace_memory_region *mem,
 				enum kvm_mr_change change)
 {
 	/*
@@ -7665,14 +7946,14 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
-				struct kvm_userspace_memory_region *mem,
+				const struct kvm_userspace_memory_region *mem,
 				const struct kvm_memory_slot *old,
+				const struct kvm_memory_slot *new,
 				enum kvm_mr_change change)
 {
-	struct kvm_memory_slot *new;
 	int nr_mmu_pages = 0;
 
-	if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
+	if (change == KVM_MR_DELETE && old->id >= KVM_USER_MEM_SLOTS) {
 		int ret;
 
 		ret = vm_munmap(old->userspace_addr,
@@ -7689,9 +7970,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	if (nr_mmu_pages)
 		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
 
-	/* It's OK to get 'new' slot here as it has already been installed */
-	new = id_to_memslot(kvm->memslots, mem->slot);
-
 	/*
 	 * Dirty logging tracks sptes in 4k granularity, meaning that large
 	 * sptes have to be split.  If live migration is successful, the guest
@@ -7716,9 +7994,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 * been zapped so no dirty logging staff is needed for old slot. For
 	 * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
 	 * new and it's also covered when dealing with the new slot.
+	 *
+	 * FIXME: const-ify all uses of struct kvm_memory_slot.
 	 */
 	if (change != KVM_MR_DELETE)
-		kvm_mmu_slot_apply_flags(kvm, new);
+		kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f5fef1868096..edc8cdcd786b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -4,6 +4,8 @@
 #include <linux/kvm_host.h>
 #include "kvm_cache_regs.h"
 
+#define MSR_IA32_CR_PAT_DEFAULT  0x0007040600070406ULL
+
 static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.exception.pending = false;
@@ -160,7 +162,13 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 	gva_t addr, void *val, unsigned int bytes,
 	struct x86_exception *exception);
 
+void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
+u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
+					  int page_num);
 
 #define KVM_SUPPORTED_XCR0     (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
 				| XSTATE_BNDREGS | XSTATE_BNDCSR \
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 8f9a133cc099..f2dc08c003eb 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -70,7 +70,7 @@
 #include <asm/e820.h>
 #include <asm/mce.h>
 #include <asm/io.h>
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 #include <asm/stackprotector.h>
 #include <asm/reboot.h>		/* for struct machine_ops */
 #include <asm/kvm_para.h>
@@ -90,7 +90,7 @@ struct lguest_data lguest_data = {
 	.noirq_iret = (u32)lguest_noirq_iret,
 	.kernel_address = PAGE_OFFSET,
 	.blocked_interrupts = { 1 }, /* Block timer interrupts */
-	.syscall_vec = SYSCALL_VECTOR,
+	.syscall_vec = IA32_SYSCALL_VECTOR,
 };
 
 /*G:037
@@ -866,7 +866,7 @@ static void __init lguest_init_IRQ(void)
 	for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
 		/* Some systems map "vectors" to interrupts weirdly.  Not us! */
 		__this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
-		if (i != SYSCALL_VECTOR)
+		if (i != IA32_SYSCALL_VECTOR)
 			set_intr_gate(i, irq_entries_start +
 					8 * (i - FIRST_EXTERNAL_VECTOR));
 	}
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 1530afb07c85..f2587888d987 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -17,7 +17,6 @@ clean-files := inat-tables.c
 obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
 
 lib-y := delay.o misc.o cmdline.o
-lib-y += thunk_$(BITS).o
 lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
 lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
@@ -40,6 +39,6 @@ else
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
         lib-y += clear_page_64.o copy_page_64.o
         lib-y += memmove_64.o memset_64.o
-        lib-y += copy_user_64.o copy_user_nocache_64.o
+        lib-y += copy_user_64.o
 	lib-y += cmpxchg16b_emu.o
 endif
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 00933d5e992f..9b0ca8fe80fc 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -11,26 +11,23 @@
 
 #include <linux/linkage.h>
 #include <asm/alternative-asm.h>
-#include <asm/dwarf2.h>
 
 /* if you want SMP support, implement these with real spinlocks */
 .macro LOCK reg
-	pushfl_cfi
+	pushfl
 	cli
 .endm
 
 .macro UNLOCK reg
-	popfl_cfi
+	popfl
 .endm
 
 #define BEGIN(op) \
 .macro endp; \
-	CFI_ENDPROC; \
 ENDPROC(atomic64_##op##_386); \
 .purgem endp; \
 .endm; \
 ENTRY(atomic64_##op##_386); \
-	CFI_STARTPROC; \
 	LOCK v;
 
 #define ENDP endp
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 082a85167a5b..db3ae85440ff 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -11,7 +11,6 @@
 
 #include <linux/linkage.h>
 #include <asm/alternative-asm.h>
-#include <asm/dwarf2.h>
 
 .macro read64 reg
 	movl %ebx, %eax
@@ -22,16 +21,11 @@
 .endm
 
 ENTRY(atomic64_read_cx8)
-	CFI_STARTPROC
-
 	read64 %ecx
 	ret
-	CFI_ENDPROC
 ENDPROC(atomic64_read_cx8)
 
 ENTRY(atomic64_set_cx8)
-	CFI_STARTPROC
-
 1:
 /* we don't need LOCK_PREFIX since aligned 64-bit writes
  * are atomic on 586 and newer */
@@ -39,28 +33,23 @@ ENTRY(atomic64_set_cx8)
 	jne 1b
 
 	ret
-	CFI_ENDPROC
 ENDPROC(atomic64_set_cx8)
 
 ENTRY(atomic64_xchg_cx8)
-	CFI_STARTPROC
-
 1:
 	LOCK_PREFIX
 	cmpxchg8b (%esi)
 	jne 1b
 
 	ret
-	CFI_ENDPROC
 ENDPROC(atomic64_xchg_cx8)
 
 .macro addsub_return func ins insc
 ENTRY(atomic64_\func\()_return_cx8)
-	CFI_STARTPROC
-	pushl_cfi_reg ebp
-	pushl_cfi_reg ebx
-	pushl_cfi_reg esi
-	pushl_cfi_reg edi
+	pushl %ebp
+	pushl %ebx
+	pushl %esi
+	pushl %edi
 
 	movl %eax, %esi
 	movl %edx, %edi
@@ -79,12 +68,11 @@ ENTRY(atomic64_\func\()_return_cx8)
 10:
 	movl %ebx, %eax
 	movl %ecx, %edx
-	popl_cfi_reg edi
-	popl_cfi_reg esi
-	popl_cfi_reg ebx
-	popl_cfi_reg ebp
+	popl %edi
+	popl %esi
+	popl %ebx
+	popl %ebp
 	ret
-	CFI_ENDPROC
 ENDPROC(atomic64_\func\()_return_cx8)
 .endm
 
@@ -93,8 +81,7 @@ addsub_return sub sub sbb
 
 .macro incdec_return func ins insc
 ENTRY(atomic64_\func\()_return_cx8)
-	CFI_STARTPROC
-	pushl_cfi_reg ebx
+	pushl %ebx
 
 	read64 %esi
 1:
@@ -109,9 +96,8 @@ ENTRY(atomic64_\func\()_return_cx8)
 10:
 	movl %ebx, %eax
 	movl %ecx, %edx
-	popl_cfi_reg ebx
+	popl %ebx
 	ret
-	CFI_ENDPROC
 ENDPROC(atomic64_\func\()_return_cx8)
 .endm
 
@@ -119,8 +105,7 @@ incdec_return inc add adc
 incdec_return dec sub sbb
 
 ENTRY(atomic64_dec_if_positive_cx8)
-	CFI_STARTPROC
-	pushl_cfi_reg ebx
+	pushl %ebx
 
 	read64 %esi
 1:
@@ -136,18 +121,16 @@ ENTRY(atomic64_dec_if_positive_cx8)
 2:
 	movl %ebx, %eax
 	movl %ecx, %edx
-	popl_cfi_reg ebx
+	popl %ebx
 	ret
-	CFI_ENDPROC
 ENDPROC(atomic64_dec_if_positive_cx8)
 
 ENTRY(atomic64_add_unless_cx8)
-	CFI_STARTPROC
-	pushl_cfi_reg ebp
-	pushl_cfi_reg ebx
+	pushl %ebp
+	pushl %ebx
 /* these just push these two parameters on the stack */
-	pushl_cfi_reg edi
-	pushl_cfi_reg ecx
+	pushl %edi
+	pushl %ecx
 
 	movl %eax, %ebp
 	movl %edx, %edi
@@ -168,21 +151,18 @@ ENTRY(atomic64_add_unless_cx8)
 	movl $1, %eax
 3:
 	addl $8, %esp
-	CFI_ADJUST_CFA_OFFSET -8
-	popl_cfi_reg ebx
-	popl_cfi_reg ebp
+	popl %ebx
+	popl %ebp
 	ret
 4:
 	cmpl %edx, 4(%esp)
 	jne 2b
 	xorl %eax, %eax
 	jmp 3b
-	CFI_ENDPROC
 ENDPROC(atomic64_add_unless_cx8)
 
 ENTRY(atomic64_inc_not_zero_cx8)
-	CFI_STARTPROC
-	pushl_cfi_reg ebx
+	pushl %ebx
 
 	read64 %esi
 1:
@@ -199,7 +179,6 @@ ENTRY(atomic64_inc_not_zero_cx8)
 
 	movl $1, %eax
 3:
-	popl_cfi_reg ebx
+	popl %ebx
 	ret
-	CFI_ENDPROC
 ENDPROC(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 9bc944a91274..c1e623209853 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -26,7 +26,6 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/errno.h>
 #include <asm/asm.h>
 				
@@ -50,9 +49,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
 	   * alignment for the unrolled loop.
 	   */		
 ENTRY(csum_partial)
-	CFI_STARTPROC
-	pushl_cfi_reg esi
-	pushl_cfi_reg ebx
+	pushl %esi
+	pushl %ebx
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
 	movl 12(%esp),%esi	# Function arg: unsigned char *buff
@@ -129,10 +127,9 @@ ENTRY(csum_partial)
 	jz 8f
 	roll $8, %eax
 8:
-	popl_cfi_reg ebx
-	popl_cfi_reg esi
+	popl %ebx
+	popl %esi
 	ret
-	CFI_ENDPROC
 ENDPROC(csum_partial)
 
 #else
@@ -140,9 +137,8 @@ ENDPROC(csum_partial)
 /* Version for PentiumII/PPro */
 
 ENTRY(csum_partial)
-	CFI_STARTPROC
-	pushl_cfi_reg esi
-	pushl_cfi_reg ebx
+	pushl %esi
+	pushl %ebx
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
 	movl 12(%esp),%esi	# Function arg:	const unsigned char *buf
@@ -249,10 +245,9 @@ ENTRY(csum_partial)
 	jz 90f
 	roll $8, %eax
 90: 
-	popl_cfi_reg ebx
-	popl_cfi_reg esi
+	popl %ebx
+	popl %esi
 	ret
-	CFI_ENDPROC
 ENDPROC(csum_partial)
 				
 #endif
@@ -287,12 +282,10 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst,
 #define FP		12
 		
 ENTRY(csum_partial_copy_generic)
-	CFI_STARTPROC
 	subl  $4,%esp	
-	CFI_ADJUST_CFA_OFFSET 4
-	pushl_cfi_reg edi
-	pushl_cfi_reg esi
-	pushl_cfi_reg ebx
+	pushl %edi
+	pushl %esi
+	pushl %ebx
 	movl ARGBASE+16(%esp),%eax	# sum
 	movl ARGBASE+12(%esp),%ecx	# len
 	movl ARGBASE+4(%esp),%esi	# src
@@ -401,12 +394,11 @@ DST(	movb %cl, (%edi)	)
 
 .previous
 
-	popl_cfi_reg ebx
-	popl_cfi_reg esi
-	popl_cfi_reg edi
-	popl_cfi %ecx			# equivalent to addl $4,%esp
+	popl %ebx
+	popl %esi
+	popl %edi
+	popl %ecx			# equivalent to addl $4,%esp
 	ret	
-	CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
 
 #else
@@ -426,10 +418,9 @@ ENDPROC(csum_partial_copy_generic)
 #define ARGBASE 12
 		
 ENTRY(csum_partial_copy_generic)
-	CFI_STARTPROC
-	pushl_cfi_reg ebx
-	pushl_cfi_reg edi
-	pushl_cfi_reg esi
+	pushl %ebx
+	pushl %edi
+	pushl %esi
 	movl ARGBASE+4(%esp),%esi	#src
 	movl ARGBASE+8(%esp),%edi	#dst	
 	movl ARGBASE+12(%esp),%ecx	#len
@@ -489,11 +480,10 @@ DST(	movb %dl, (%edi)         )
 	jmp  7b			
 .previous				
 
-	popl_cfi_reg esi
-	popl_cfi_reg edi
-	popl_cfi_reg ebx
+	popl %esi
+	popl %edi
+	popl %ebx
 	ret
-	CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
 				
 #undef ROUND
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index e67e579c93bd..a2fe51b00cce 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,4 @@
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
@@ -15,7 +14,6 @@
  * %rdi	- page
  */
 ENTRY(clear_page)
-	CFI_STARTPROC
 
 	ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
 		      "jmp clear_page_c_e", X86_FEATURE_ERMS
@@ -24,11 +22,9 @@ ENTRY(clear_page)
 	xorl %eax,%eax
 	rep stosq
 	ret
-	CFI_ENDPROC
 ENDPROC(clear_page)
 
 ENTRY(clear_page_orig)
-	CFI_STARTPROC
 
 	xorl   %eax,%eax
 	movl   $4096/64,%ecx
@@ -48,14 +44,11 @@ ENTRY(clear_page_orig)
 	jnz	.Lloop
 	nop
 	ret
-	CFI_ENDPROC
 ENDPROC(clear_page_orig)
 
 ENTRY(clear_page_c_e)
-	CFI_STARTPROC
 	movl $4096,%ecx
 	xorl %eax,%eax
 	rep stosb
 	ret
-	CFI_ENDPROC
 ENDPROC(clear_page_c_e)
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
index 40a172541ee2..9b330242e740 100644
--- a/arch/x86/lib/cmpxchg16b_emu.S
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -6,7 +6,6 @@
  *
  */
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/percpu.h>
 
 .text
@@ -21,7 +20,6 @@
  * %al  : Operation successful
  */
 ENTRY(this_cpu_cmpxchg16b_emu)
-CFI_STARTPROC
 
 #
 # Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
@@ -32,7 +30,7 @@ CFI_STARTPROC
 # *atomic* on a single cpu (as provided by the this_cpu_xx class of
 # macros).
 #
-	pushfq_cfi
+	pushfq
 	cli
 
 	cmpq PER_CPU_VAR((%rsi)), %rax
@@ -43,17 +41,13 @@ CFI_STARTPROC
 	movq %rbx, PER_CPU_VAR((%rsi))
 	movq %rcx, PER_CPU_VAR(8(%rsi))
 
-	CFI_REMEMBER_STATE
-	popfq_cfi
+	popfq
 	mov $1, %al
 	ret
 
-	CFI_RESTORE_STATE
 .Lnot_same:
-	popfq_cfi
+	popfq
 	xor %al,%al
 	ret
 
-CFI_ENDPROC
-
 ENDPROC(this_cpu_cmpxchg16b_emu)
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
index b4807fce5177..ad5349778490 100644
--- a/arch/x86/lib/cmpxchg8b_emu.S
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -7,7 +7,6 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 
 .text
 
@@ -20,14 +19,13 @@
  * %ecx : high 32 bits of new value
  */
 ENTRY(cmpxchg8b_emu)
-CFI_STARTPROC
 
 #
 # Emulate 'cmpxchg8b (%esi)' on UP except we don't
 # set the whole ZF thing (caller will just compare
 # eax:edx with the expected value)
 #
-	pushfl_cfi
+	pushfl
 	cli
 
 	cmpl  (%esi), %eax
@@ -38,18 +36,15 @@ CFI_STARTPROC
 	movl %ebx,  (%esi)
 	movl %ecx, 4(%esi)
 
-	CFI_REMEMBER_STATE
-	popfl_cfi
+	popfl
 	ret
 
-	CFI_RESTORE_STATE
 .Lnot_same:
 	movl  (%esi), %eax
 .Lhalf_same:
 	movl 4(%esi), %edx
 
-	popfl_cfi
+	popfl
 	ret
 
-CFI_ENDPROC
 ENDPROC(cmpxchg8b_emu)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 8239dbcbf984..009f98216b7e 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -1,7 +1,6 @@
 /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
@@ -13,22 +12,16 @@
  */
 	ALIGN
 ENTRY(copy_page)
-	CFI_STARTPROC
 	ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
 	movl	$4096/8, %ecx
 	rep	movsq
 	ret
-	CFI_ENDPROC
 ENDPROC(copy_page)
 
 ENTRY(copy_page_regs)
-	CFI_STARTPROC
 	subq	$2*8,	%rsp
-	CFI_ADJUST_CFA_OFFSET 2*8
 	movq	%rbx,	(%rsp)
-	CFI_REL_OFFSET rbx, 0
 	movq	%r12,	1*8(%rsp)
-	CFI_REL_OFFSET r12, 1*8
 
 	movl	$(4096/64)-5,	%ecx
 	.p2align 4
@@ -87,11 +80,7 @@ ENTRY(copy_page_regs)
 	jnz	.Loop2
 
 	movq	(%rsp), %rbx
-	CFI_RESTORE rbx
 	movq	1*8(%rsp), %r12
-	CFI_RESTORE r12
 	addq	$2*8, %rsp
-	CFI_ADJUST_CFA_OFFSET -2*8
 	ret
-	CFI_ENDPROC
 ENDPROC(copy_page_regs)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index fa997dfaef24..982ce34f4a9b 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -7,7 +7,6 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
@@ -16,33 +15,8 @@
 #include <asm/asm.h>
 #include <asm/smap.h>
 
-	.macro ALIGN_DESTINATION
-	/* check for bad alignment of destination */
-	movl %edi,%ecx
-	andl $7,%ecx
-	jz 102f				/* already aligned */
-	subl $8,%ecx
-	negl %ecx
-	subl %ecx,%edx
-100:	movb (%rsi),%al
-101:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz 100b
-102:
-	.section .fixup,"ax"
-103:	addl %ecx,%edx			/* ecx is zerorest also */
-	jmp copy_user_handle_tail
-	.previous
-
-	_ASM_EXTABLE(100b,103b)
-	_ASM_EXTABLE(101b,103b)
-	.endm
-
 /* Standard copy_to_user with segment limit checking */
 ENTRY(_copy_to_user)
-	CFI_STARTPROC
 	GET_THREAD_INFO(%rax)
 	movq %rdi,%rcx
 	addq %rdx,%rcx
@@ -54,12 +28,10 @@ ENTRY(_copy_to_user)
 		      X86_FEATURE_REP_GOOD,			\
 		      "jmp copy_user_enhanced_fast_string",	\
 		      X86_FEATURE_ERMS
-	CFI_ENDPROC
 ENDPROC(_copy_to_user)
 
 /* Standard copy_from_user with segment limit checking */
 ENTRY(_copy_from_user)
-	CFI_STARTPROC
 	GET_THREAD_INFO(%rax)
 	movq %rsi,%rcx
 	addq %rdx,%rcx
@@ -71,14 +43,12 @@ ENTRY(_copy_from_user)
 		      X86_FEATURE_REP_GOOD,			\
 		      "jmp copy_user_enhanced_fast_string",	\
 		      X86_FEATURE_ERMS
-	CFI_ENDPROC
 ENDPROC(_copy_from_user)
 
 	.section .fixup,"ax"
 	/* must zero dest */
 ENTRY(bad_from_user)
 bad_from_user:
-	CFI_STARTPROC
 	movl %edx,%ecx
 	xorl %eax,%eax
 	rep
@@ -86,7 +56,6 @@ bad_from_user:
 bad_to_user:
 	movl %edx,%eax
 	ret
-	CFI_ENDPROC
 ENDPROC(bad_from_user)
 	.previous
 
@@ -104,7 +73,6 @@ ENDPROC(bad_from_user)
  * eax uncopied bytes or 0 if successful.
  */
 ENTRY(copy_user_generic_unrolled)
-	CFI_STARTPROC
 	ASM_STAC
 	cmpl $8,%edx
 	jb 20f		/* less then 8 bytes, go to byte copy loop */
@@ -186,7 +154,6 @@ ENTRY(copy_user_generic_unrolled)
 	_ASM_EXTABLE(19b,40b)
 	_ASM_EXTABLE(21b,50b)
 	_ASM_EXTABLE(22b,50b)
-	CFI_ENDPROC
 ENDPROC(copy_user_generic_unrolled)
 
 /* Some CPUs run faster using the string copy instructions.
@@ -208,7 +175,6 @@ ENDPROC(copy_user_generic_unrolled)
  * eax uncopied bytes or 0 if successful.
  */
 ENTRY(copy_user_generic_string)
-	CFI_STARTPROC
 	ASM_STAC
 	cmpl $8,%edx
 	jb 2f		/* less than 8 bytes, go to byte copy loop */
@@ -233,7 +199,6 @@ ENTRY(copy_user_generic_string)
 
 	_ASM_EXTABLE(1b,11b)
 	_ASM_EXTABLE(3b,12b)
-	CFI_ENDPROC
 ENDPROC(copy_user_generic_string)
 
 /*
@@ -249,7 +214,6 @@ ENDPROC(copy_user_generic_string)
  * eax uncopied bytes or 0 if successful.
  */
 ENTRY(copy_user_enhanced_fast_string)
-	CFI_STARTPROC
 	ASM_STAC
 	movl %edx,%ecx
 1:	rep
@@ -264,5 +228,94 @@ ENTRY(copy_user_enhanced_fast_string)
 	.previous
 
 	_ASM_EXTABLE(1b,12b)
-	CFI_ENDPROC
 ENDPROC(copy_user_enhanced_fast_string)
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force destination/source out of cache for more performance.
+ */
+ENTRY(__copy_user_nocache)
+	ASM_STAC
+	cmpl $8,%edx
+	jb 20f		/* less then 8 bytes, go to byte copy loop */
+	ALIGN_DESTINATION
+	movl %edx,%ecx
+	andl $63,%edx
+	shrl $6,%ecx
+	jz 17f
+1:	movq (%rsi),%r8
+2:	movq 1*8(%rsi),%r9
+3:	movq 2*8(%rsi),%r10
+4:	movq 3*8(%rsi),%r11
+5:	movnti %r8,(%rdi)
+6:	movnti %r9,1*8(%rdi)
+7:	movnti %r10,2*8(%rdi)
+8:	movnti %r11,3*8(%rdi)
+9:	movq 4*8(%rsi),%r8
+10:	movq 5*8(%rsi),%r9
+11:	movq 6*8(%rsi),%r10
+12:	movq 7*8(%rsi),%r11
+13:	movnti %r8,4*8(%rdi)
+14:	movnti %r9,5*8(%rdi)
+15:	movnti %r10,6*8(%rdi)
+16:	movnti %r11,7*8(%rdi)
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	decl %ecx
+	jnz 1b
+17:	movl %edx,%ecx
+	andl $7,%edx
+	shrl $3,%ecx
+	jz 20f
+18:	movq (%rsi),%r8
+19:	movnti %r8,(%rdi)
+	leaq 8(%rsi),%rsi
+	leaq 8(%rdi),%rdi
+	decl %ecx
+	jnz 18b
+20:	andl %edx,%edx
+	jz 23f
+	movl %edx,%ecx
+21:	movb (%rsi),%al
+22:	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 21b
+23:	xorl %eax,%eax
+	ASM_CLAC
+	sfence
+	ret
+
+	.section .fixup,"ax"
+30:	shll $6,%ecx
+	addl %ecx,%edx
+	jmp 60f
+40:	lea (%rdx,%rcx,8),%rdx
+	jmp 60f
+50:	movl %ecx,%edx
+60:	sfence
+	jmp copy_user_handle_tail
+	.previous
+
+	_ASM_EXTABLE(1b,30b)
+	_ASM_EXTABLE(2b,30b)
+	_ASM_EXTABLE(3b,30b)
+	_ASM_EXTABLE(4b,30b)
+	_ASM_EXTABLE(5b,30b)
+	_ASM_EXTABLE(6b,30b)
+	_ASM_EXTABLE(7b,30b)
+	_ASM_EXTABLE(8b,30b)
+	_ASM_EXTABLE(9b,30b)
+	_ASM_EXTABLE(10b,30b)
+	_ASM_EXTABLE(11b,30b)
+	_ASM_EXTABLE(12b,30b)
+	_ASM_EXTABLE(13b,30b)
+	_ASM_EXTABLE(14b,30b)
+	_ASM_EXTABLE(15b,30b)
+	_ASM_EXTABLE(16b,30b)
+	_ASM_EXTABLE(18b,40b)
+	_ASM_EXTABLE(19b,40b)
+	_ASM_EXTABLE(21b,50b)
+	_ASM_EXTABLE(22b,50b)
+ENDPROC(__copy_user_nocache)
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
deleted file mode 100644
index 6a4f43c2d9e6..000000000000
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
- * Copyright 2002 Andi Kleen, SuSE Labs.
- * Subject to the GNU Public License v2.
- *
- * Functions to copy from and to user space.
- */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-
-#define FIX_ALIGNMENT 1
-
-#include <asm/current.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/asm.h>
-#include <asm/smap.h>
-
-	.macro ALIGN_DESTINATION
-#ifdef FIX_ALIGNMENT
-	/* check for bad alignment of destination */
-	movl %edi,%ecx
-	andl $7,%ecx
-	jz 102f				/* already aligned */
-	subl $8,%ecx
-	negl %ecx
-	subl %ecx,%edx
-100:	movb (%rsi),%al
-101:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz 100b
-102:
-	.section .fixup,"ax"
-103:	addl %ecx,%edx			/* ecx is zerorest also */
-	jmp copy_user_handle_tail
-	.previous
-
-	_ASM_EXTABLE(100b,103b)
-	_ASM_EXTABLE(101b,103b)
-#endif
-	.endm
-
-/*
- * copy_user_nocache - Uncached memory copy with exception handling
- * This will force destination/source out of cache for more performance.
- */
-ENTRY(__copy_user_nocache)
-	CFI_STARTPROC
-	ASM_STAC
-	cmpl $8,%edx
-	jb 20f		/* less then 8 bytes, go to byte copy loop */
-	ALIGN_DESTINATION
-	movl %edx,%ecx
-	andl $63,%edx
-	shrl $6,%ecx
-	jz 17f
-1:	movq (%rsi),%r8
-2:	movq 1*8(%rsi),%r9
-3:	movq 2*8(%rsi),%r10
-4:	movq 3*8(%rsi),%r11
-5:	movnti %r8,(%rdi)
-6:	movnti %r9,1*8(%rdi)
-7:	movnti %r10,2*8(%rdi)
-8:	movnti %r11,3*8(%rdi)
-9:	movq 4*8(%rsi),%r8
-10:	movq 5*8(%rsi),%r9
-11:	movq 6*8(%rsi),%r10
-12:	movq 7*8(%rsi),%r11
-13:	movnti %r8,4*8(%rdi)
-14:	movnti %r9,5*8(%rdi)
-15:	movnti %r10,6*8(%rdi)
-16:	movnti %r11,7*8(%rdi)
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
-	decl %ecx
-	jnz 1b
-17:	movl %edx,%ecx
-	andl $7,%edx
-	shrl $3,%ecx
-	jz 20f
-18:	movq (%rsi),%r8
-19:	movnti %r8,(%rdi)
-	leaq 8(%rsi),%rsi
-	leaq 8(%rdi),%rdi
-	decl %ecx
-	jnz 18b
-20:	andl %edx,%edx
-	jz 23f
-	movl %edx,%ecx
-21:	movb (%rsi),%al
-22:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz 21b
-23:	xorl %eax,%eax
-	ASM_CLAC
-	sfence
-	ret
-
-	.section .fixup,"ax"
-30:	shll $6,%ecx
-	addl %ecx,%edx
-	jmp 60f
-40:	lea (%rdx,%rcx,8),%rdx
-	jmp 60f
-50:	movl %ecx,%edx
-60:	sfence
-	jmp copy_user_handle_tail
-	.previous
-
-	_ASM_EXTABLE(1b,30b)
-	_ASM_EXTABLE(2b,30b)
-	_ASM_EXTABLE(3b,30b)
-	_ASM_EXTABLE(4b,30b)
-	_ASM_EXTABLE(5b,30b)
-	_ASM_EXTABLE(6b,30b)
-	_ASM_EXTABLE(7b,30b)
-	_ASM_EXTABLE(8b,30b)
-	_ASM_EXTABLE(9b,30b)
-	_ASM_EXTABLE(10b,30b)
-	_ASM_EXTABLE(11b,30b)
-	_ASM_EXTABLE(12b,30b)
-	_ASM_EXTABLE(13b,30b)
-	_ASM_EXTABLE(14b,30b)
-	_ASM_EXTABLE(15b,30b)
-	_ASM_EXTABLE(16b,30b)
-	_ASM_EXTABLE(18b,40b)
-	_ASM_EXTABLE(19b,40b)
-	_ASM_EXTABLE(21b,50b)
-	_ASM_EXTABLE(22b,50b)
-	CFI_ENDPROC
-ENDPROC(__copy_user_nocache)
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 9734182966f3..7e48807b2fa1 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -6,7 +6,6 @@
  * for more details. No warranty for anything given at all.
  */
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/errno.h>
 #include <asm/asm.h>
 
@@ -47,23 +46,16 @@
 
 
 ENTRY(csum_partial_copy_generic)
-	CFI_STARTPROC
 	cmpl	$3*64, %edx
 	jle	.Lignore
 
 .Lignore:
 	subq  $7*8, %rsp
-	CFI_ADJUST_CFA_OFFSET 7*8
 	movq  %rbx, 2*8(%rsp)
-	CFI_REL_OFFSET rbx, 2*8
 	movq  %r12, 3*8(%rsp)
-	CFI_REL_OFFSET r12, 3*8
 	movq  %r14, 4*8(%rsp)
-	CFI_REL_OFFSET r14, 4*8
 	movq  %r13, 5*8(%rsp)
-	CFI_REL_OFFSET r13, 5*8
 	movq  %rbp, 6*8(%rsp)
-	CFI_REL_OFFSET rbp, 6*8
 
 	movq  %r8, (%rsp)
 	movq  %r9, 1*8(%rsp)
@@ -206,22 +198,14 @@ ENTRY(csum_partial_copy_generic)
 	addl %ebx, %eax
 	adcl %r9d, %eax		/* carry */
 
-	CFI_REMEMBER_STATE
 .Lende:
 	movq 2*8(%rsp), %rbx
-	CFI_RESTORE rbx
 	movq 3*8(%rsp), %r12
-	CFI_RESTORE r12
 	movq 4*8(%rsp), %r14
-	CFI_RESTORE r14
 	movq 5*8(%rsp), %r13
-	CFI_RESTORE r13
 	movq 6*8(%rsp), %rbp
-	CFI_RESTORE rbp
 	addq $7*8, %rsp
-	CFI_ADJUST_CFA_OFFSET -7*8
 	ret
-	CFI_RESTORE_STATE
 
 	/* Exception handlers. Very simple, zeroing is done in the wrappers */
 .Lbad_source:
@@ -237,5 +221,4 @@ ENTRY(csum_partial_copy_generic)
 	jz   .Lende
 	movl $-EFAULT, (%rax)
 	jmp .Lende
-	CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index a4512359656a..46668cda4ffd 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -26,7 +26,6 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/page_types.h>
 #include <asm/errno.h>
 #include <asm/asm-offsets.h>
@@ -36,7 +35,6 @@
 
 	.text
 ENTRY(__get_user_1)
-	CFI_STARTPROC
 	GET_THREAD_INFO(%_ASM_DX)
 	cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
 	jae bad_get_user
@@ -45,11 +43,9 @@ ENTRY(__get_user_1)
 	xor %eax,%eax
 	ASM_CLAC
 	ret
-	CFI_ENDPROC
 ENDPROC(__get_user_1)
 
 ENTRY(__get_user_2)
-	CFI_STARTPROC
 	add $1,%_ASM_AX
 	jc bad_get_user
 	GET_THREAD_INFO(%_ASM_DX)
@@ -60,11 +56,9 @@ ENTRY(__get_user_2)
 	xor %eax,%eax
 	ASM_CLAC
 	ret
-	CFI_ENDPROC
 ENDPROC(__get_user_2)
 
 ENTRY(__get_user_4)
-	CFI_STARTPROC
 	add $3,%_ASM_AX
 	jc bad_get_user
 	GET_THREAD_INFO(%_ASM_DX)
@@ -75,11 +69,9 @@ ENTRY(__get_user_4)
 	xor %eax,%eax
 	ASM_CLAC
 	ret
-	CFI_ENDPROC
 ENDPROC(__get_user_4)
 
 ENTRY(__get_user_8)
-	CFI_STARTPROC
 #ifdef CONFIG_X86_64
 	add $7,%_ASM_AX
 	jc bad_get_user
@@ -104,28 +96,23 @@ ENTRY(__get_user_8)
 	ASM_CLAC
 	ret
 #endif
-	CFI_ENDPROC
 ENDPROC(__get_user_8)
 
 
 bad_get_user:
-	CFI_STARTPROC
 	xor %edx,%edx
 	mov $(-EFAULT),%_ASM_AX
 	ASM_CLAC
 	ret
-	CFI_ENDPROC
 END(bad_get_user)
 
 #ifdef CONFIG_X86_32
 bad_get_user_8:
-	CFI_STARTPROC
 	xor %edx,%edx
 	xor %ecx,%ecx
 	mov $(-EFAULT),%_ASM_AX
 	ASM_CLAC
 	ret
-	CFI_ENDPROC
 END(bad_get_user_8)
 #endif
 
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
index 05a95e713da8..33147fef3452 100644
--- a/arch/x86/lib/iomap_copy_64.S
+++ b/arch/x86/lib/iomap_copy_64.S
@@ -16,15 +16,12 @@
  */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 
 /*
  * override generic version in lib/iomap_copy.c
  */
 ENTRY(__iowrite32_copy)
-	CFI_STARTPROC
 	movl %edx,%ecx
 	rep movsd
 	ret
-	CFI_ENDPROC
 ENDPROC(__iowrite32_copy)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index b046664f5a1c..16698bba87de 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -2,7 +2,6 @@
 
 #include <linux/linkage.h>
 #include <asm/cpufeature.h>
-#include <asm/dwarf2.h>
 #include <asm/alternative-asm.h>
 
 /*
@@ -53,7 +52,6 @@ ENTRY(memcpy_erms)
 ENDPROC(memcpy_erms)
 
 ENTRY(memcpy_orig)
-	CFI_STARTPROC
 	movq %rdi, %rax
 
 	cmpq $0x20, %rdx
@@ -178,5 +176,4 @@ ENTRY(memcpy_orig)
 
 .Lend:
 	retq
-	CFI_ENDPROC
 ENDPROC(memcpy_orig)
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 0f8a0d0331b9..ca2afdd6d98e 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -6,7 +6,6 @@
  *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
  */
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
@@ -27,7 +26,6 @@
 
 ENTRY(memmove)
 ENTRY(__memmove)
-	CFI_STARTPROC
 
 	/* Handle more 32 bytes in loop */
 	mov %rdi, %rax
@@ -207,6 +205,5 @@ ENTRY(__memmove)
 	movb %r11b, (%rdi)
 13:
 	retq
-	CFI_ENDPROC
 ENDPROC(__memmove)
 ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 93118fb23976..2661fad05827 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -1,7 +1,6 @@
 /* Copyright 2002 Andi Kleen, SuSE Labs */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
@@ -66,7 +65,6 @@ ENTRY(memset_erms)
 ENDPROC(memset_erms)
 
 ENTRY(memset_orig)
-	CFI_STARTPROC
 	movq %rdi,%r10
 
 	/* expand byte value  */
@@ -78,7 +76,6 @@ ENTRY(memset_orig)
 	movl  %edi,%r9d
 	andl  $7,%r9d
 	jnz  .Lbad_alignment
-	CFI_REMEMBER_STATE
 .Lafter_bad_alignment:
 
 	movq  %rdx,%rcx
@@ -128,7 +125,6 @@ ENTRY(memset_orig)
 	movq	%r10,%rax
 	ret
 
-	CFI_RESTORE_STATE
 .Lbad_alignment:
 	cmpq $7,%rdx
 	jbe	.Lhandle_7
@@ -139,5 +135,4 @@ ENTRY(memset_orig)
 	subq %r8,%rdx
 	jmp .Lafter_bad_alignment
 .Lfinal:
-	CFI_ENDPROC
 ENDPROC(memset_orig)
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
index c9f2d9ba8dd8..e5e3ed8dc079 100644
--- a/arch/x86/lib/mmx_32.c
+++ b/arch/x86/lib/mmx_32.c
@@ -22,7 +22,7 @@
 #include <linux/sched.h>
 #include <linux/types.h>
 
-#include <asm/i387.h>
+#include <asm/fpu/api.h>
 #include <asm/asm.h>
 
 void *_mmx_memcpy(void *to, const void *from, size_t len)
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index 3ca5218fbece..c81556409bbb 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -1,6 +1,5 @@
 #include <linux/linkage.h>
 #include <linux/errno.h>
-#include <asm/dwarf2.h>
 #include <asm/asm.h>
 #include <asm/msr.h>
 
@@ -13,9 +12,8 @@
  */
 .macro op_safe_regs op
 ENTRY(\op\()_safe_regs)
-	CFI_STARTPROC
-	pushq_cfi_reg rbx
-	pushq_cfi_reg rbp
+	pushq %rbx
+	pushq %rbp
 	movq	%rdi, %r10	/* Save pointer */
 	xorl	%r11d, %r11d	/* Return value */
 	movl    (%rdi), %eax
@@ -25,7 +23,6 @@ ENTRY(\op\()_safe_regs)
 	movl    20(%rdi), %ebp
 	movl    24(%rdi), %esi
 	movl    28(%rdi), %edi
-	CFI_REMEMBER_STATE
 1:	\op
 2:	movl    %eax, (%r10)
 	movl	%r11d, %eax	/* Return value */
@@ -35,16 +32,14 @@ ENTRY(\op\()_safe_regs)
 	movl    %ebp, 20(%r10)
 	movl    %esi, 24(%r10)
 	movl    %edi, 28(%r10)
-	popq_cfi_reg rbp
-	popq_cfi_reg rbx
+	popq %rbp
+	popq %rbx
 	ret
 3:
-	CFI_RESTORE_STATE
 	movl    $-EIO, %r11d
 	jmp     2b
 
 	_ASM_EXTABLE(1b, 3b)
-	CFI_ENDPROC
 ENDPROC(\op\()_safe_regs)
 .endm
 
@@ -52,13 +47,12 @@ ENDPROC(\op\()_safe_regs)
 
 .macro op_safe_regs op
 ENTRY(\op\()_safe_regs)
-	CFI_STARTPROC
-	pushl_cfi_reg ebx
-	pushl_cfi_reg ebp
-	pushl_cfi_reg esi
-	pushl_cfi_reg edi
-	pushl_cfi $0              /* Return value */
-	pushl_cfi %eax
+	pushl %ebx
+	pushl %ebp
+	pushl %esi
+	pushl %edi
+	pushl $0              /* Return value */
+	pushl %eax
 	movl    4(%eax), %ecx
 	movl    8(%eax), %edx
 	movl    12(%eax), %ebx
@@ -66,32 +60,28 @@ ENTRY(\op\()_safe_regs)
 	movl    24(%eax), %esi
 	movl    28(%eax), %edi
 	movl    (%eax), %eax
-	CFI_REMEMBER_STATE
 1:	\op
-2:	pushl_cfi %eax
+2:	pushl %eax
 	movl    4(%esp), %eax
-	popl_cfi (%eax)
+	popl (%eax)
 	addl    $4, %esp
-	CFI_ADJUST_CFA_OFFSET -4
 	movl    %ecx, 4(%eax)
 	movl    %edx, 8(%eax)
 	movl    %ebx, 12(%eax)
 	movl    %ebp, 20(%eax)
 	movl    %esi, 24(%eax)
 	movl    %edi, 28(%eax)
-	popl_cfi %eax
-	popl_cfi_reg edi
-	popl_cfi_reg esi
-	popl_cfi_reg ebp
-	popl_cfi_reg ebx
+	popl %eax
+	popl %edi
+	popl %esi
+	popl %ebp
+	popl %ebx
 	ret
 3:
-	CFI_RESTORE_STATE
 	movl    $-EIO, 4(%esp)
 	jmp     2b
 
 	_ASM_EXTABLE(1b, 3b)
-	CFI_ENDPROC
 ENDPROC(\op\()_safe_regs)
 .endm
 
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index fc6ba17a7eec..e0817a12d323 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -11,7 +11,6 @@
  * return value.
  */
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 #include <asm/thread_info.h>
 #include <asm/errno.h>
 #include <asm/asm.h>
@@ -30,11 +29,9 @@
  * as they get called from within inline assembly.
  */
 
-#define ENTER	CFI_STARTPROC ; \
-		GET_THREAD_INFO(%_ASM_BX)
+#define ENTER	GET_THREAD_INFO(%_ASM_BX)
 #define EXIT	ASM_CLAC ;	\
-		ret ;		\
-		CFI_ENDPROC
+		ret
 
 .text
 ENTRY(__put_user_1)
@@ -87,7 +84,6 @@ ENTRY(__put_user_8)
 ENDPROC(__put_user_8)
 
 bad_put_user:
-	CFI_STARTPROC
 	movl $-EFAULT,%eax
 	EXIT
 END(bad_put_user)
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S
index 2322abe4da3b..40027db99140 100644
--- a/arch/x86/lib/rwsem.S
+++ b/arch/x86/lib/rwsem.S
@@ -15,7 +15,6 @@
 
 #include <linux/linkage.h>
 #include <asm/alternative-asm.h>
-#include <asm/dwarf2.h>
 
 #define __ASM_HALF_REG(reg)	__ASM_SEL(reg, e##reg)
 #define __ASM_HALF_SIZE(inst)	__ASM_SEL(inst##w, inst##l)
@@ -34,10 +33,10 @@
  */
 
 #define save_common_regs \
-	pushl_cfi_reg ecx
+	pushl %ecx
 
 #define restore_common_regs \
-	popl_cfi_reg ecx
+	popl %ecx
 
 	/* Avoid uglifying the argument copying x86-64 needs to do. */
 	.macro movq src, dst
@@ -64,50 +63,45 @@
  */
 
 #define save_common_regs \
-	pushq_cfi_reg rdi; \
-	pushq_cfi_reg rsi; \
-	pushq_cfi_reg rcx; \
-	pushq_cfi_reg r8;  \
-	pushq_cfi_reg r9;  \
-	pushq_cfi_reg r10; \
-	pushq_cfi_reg r11
+	pushq %rdi; \
+	pushq %rsi; \
+	pushq %rcx; \
+	pushq %r8;  \
+	pushq %r9;  \
+	pushq %r10; \
+	pushq %r11
 
 #define restore_common_regs \
-	popq_cfi_reg r11; \
-	popq_cfi_reg r10; \
-	popq_cfi_reg r9; \
-	popq_cfi_reg r8; \
-	popq_cfi_reg rcx; \
-	popq_cfi_reg rsi; \
-	popq_cfi_reg rdi
+	popq %r11; \
+	popq %r10; \
+	popq %r9; \
+	popq %r8; \
+	popq %rcx; \
+	popq %rsi; \
+	popq %rdi
 
 #endif
 
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_down_read_failed)
-	CFI_STARTPROC
 	save_common_regs
-	__ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
+	__ASM_SIZE(push,) %__ASM_REG(dx)
 	movq %rax,%rdi
 	call rwsem_down_read_failed
-	__ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
+	__ASM_SIZE(pop,) %__ASM_REG(dx)
 	restore_common_regs
 	ret
-	CFI_ENDPROC
 ENDPROC(call_rwsem_down_read_failed)
 
 ENTRY(call_rwsem_down_write_failed)
-	CFI_STARTPROC
 	save_common_regs
 	movq %rax,%rdi
 	call rwsem_down_write_failed
 	restore_common_regs
 	ret
-	CFI_ENDPROC
 ENDPROC(call_rwsem_down_write_failed)
 
 ENTRY(call_rwsem_wake)
-	CFI_STARTPROC
 	/* do nothing if still outstanding active readers */
 	__ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx)
 	jnz 1f
@@ -116,17 +110,14 @@ ENTRY(call_rwsem_wake)
 	call rwsem_wake
 	restore_common_regs
 1:	ret
-	CFI_ENDPROC
 ENDPROC(call_rwsem_wake)
 
 ENTRY(call_rwsem_downgrade_wake)
-	CFI_STARTPROC
 	save_common_regs
-	__ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
+	__ASM_SIZE(push,) %__ASM_REG(dx)
 	movq %rax,%rdi
 	call rwsem_downgrade_wake
-	__ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
+	__ASM_SIZE(pop,) %__ASM_REG(dx)
 	restore_common_regs
 	ret
-	CFI_ENDPROC
 ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index e2f5e21c03b3..91d93b95bd86 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -647,7 +647,8 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
  * @from: Source address, in kernel space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from kernel space to user space.
  *
@@ -668,7 +669,8 @@ EXPORT_SYMBOL(_copy_to_user);
  * @from: Source address, in user space.
  * @n:    Number of bytes to copy.
  *
- * Context: User context only.  This function may sleep.
+ * Context: User context only. This function may sleep if pagefaults are
+ *          enabled.
  *
  * Copy data from user space to kernel space.
  *
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index dc8adad10a2f..dd76a05729b0 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,7 +30,7 @@ static void fclex(void)
 }
 
 /* Needs to be externally visible */
-void finit_soft_fpu(struct i387_soft_struct *soft)
+void fpstate_init_soft(struct swregs_state *soft)
 {
 	struct address *oaddr, *iaddr;
 	memset(soft, 0, sizeof(*soft));
@@ -52,7 +52,7 @@ void finit_soft_fpu(struct i387_soft_struct *soft)
 
 void finit(void)
 {
-	finit_soft_fpu(&current->thread.fpu.state->soft);
+	fpstate_init_soft(&current->thread.fpu.state.soft);
 }
 
 /*
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 9b868124128d..f37e84ab49f3 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -31,7 +31,7 @@
 #include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/user.h>
-#include <asm/i387.h>
+#include <asm/fpu/internal.h>
 
 #include "fpu_system.h"
 #include "fpu_emu.h"
@@ -147,13 +147,9 @@ void math_emulate(struct math_emu_info *info)
 	unsigned long code_base = 0;
 	unsigned long code_limit = 0;	/* Initialized to stop compiler warnings */
 	struct desc_struct code_descriptor;
+	struct fpu *fpu = &current->thread.fpu;
 
-	if (!used_math()) {
-		if (init_fpu(current)) {
-			do_group_exit(SIGKILL);
-			return;
-		}
-	}
+	fpu__activate_curr(fpu);
 
 #ifdef RE_ENTRANT_CHECKING
 	if (emulating) {
@@ -673,7 +669,7 @@ void math_abort(struct math_emu_info *info, unsigned int signal)
 #endif /* PARANOID */
 }
 
-#define S387 ((struct i387_soft_struct *)s387)
+#define S387 ((struct swregs_state *)s387)
 #define sstatus_word() \
   ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top))
 
@@ -682,14 +678,14 @@ int fpregs_soft_set(struct task_struct *target,
 		    unsigned int pos, unsigned int count,
 		    const void *kbuf, const void __user *ubuf)
 {
-	struct i387_soft_struct *s387 = &target->thread.fpu.state->soft;
+	struct swregs_state *s387 = &target->thread.fpu.state.soft;
 	void *space = s387->st_space;
 	int ret;
 	int offset, other, i, tags, regnr, tag, newtop;
 
 	RE_ENTRANT_CHECK_OFF;
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, s387, 0,
-				 offsetof(struct i387_soft_struct, st_space));
+				 offsetof(struct swregs_state, st_space));
 	RE_ENTRANT_CHECK_ON;
 
 	if (ret)
@@ -734,7 +730,7 @@ int fpregs_soft_get(struct task_struct *target,
 		    unsigned int pos, unsigned int count,
 		    void *kbuf, void __user *ubuf)
 {
-	struct i387_soft_struct *s387 = &target->thread.fpu.state->soft;
+	struct swregs_state *s387 = &target->thread.fpu.state.soft;
 	const void *space = s387->st_space;
 	int ret;
 	int offset = (S387->ftop & 7) * 10, other = 80 - offset;
@@ -752,7 +748,7 @@ int fpregs_soft_get(struct task_struct *target,
 #endif /* PECULIAR_486 */
 
 	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0,
-				  offsetof(struct i387_soft_struct, st_space));
+				  offsetof(struct swregs_state, st_space));
 
 	/* Copy all registers in stack order. */
 	if (!ret)
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index 2c614410a5f3..9ccecb61a4fa 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -31,7 +31,7 @@
 #define SEG_EXPAND_DOWN(s)	(((s).b & ((1 << 11) | (1 << 10))) \
 				 == (1 << 10))
 
-#define I387			(current->thread.fpu.state)
+#define I387			(&current->thread.fpu.state)
 #define FPU_info		(I387->soft.info)
 
 #define FPU_CS			(*(unsigned short *) &(FPU_info->regs->cs))
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 181c53bac3a7..9dc909841739 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -13,6 +13,7 @@
 #include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
 #include <linux/prefetch.h>		/* prefetchw			*/
 #include <linux/context_tracking.h>	/* exception_enter(), ...	*/
+#include <linux/uaccess.h>		/* faulthandler_disabled()	*/
 
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
@@ -1126,9 +1127,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
-	 * in an atomic region then we must not take the fault:
+	 * in a region with pagefaults disabled then we must not take the fault
 	 */
-	if (unlikely(in_atomic() || !mm)) {
+	if (unlikely(faulthandler_disabled() || !mm)) {
 		bad_area_nosemaphore(regs, error_code, address);
 		return;
 	}
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 4500142bc4aa..eecb207a2037 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -35,7 +35,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 	unsigned long vaddr;
 	int idx, type;
 
-	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+	preempt_disable();
 	pagefault_disable();
 
 	if (!PageHighMem(page))
@@ -100,6 +100,7 @@ void __kunmap_atomic(void *kvaddr)
 #endif
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 1d553186c434..8533b46e6bee 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -40,7 +40,7 @@
  */
 uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
 	[_PAGE_CACHE_MODE_WB      ]	= 0         | 0        ,
-	[_PAGE_CACHE_MODE_WC      ]	= _PAGE_PWT | 0        ,
+	[_PAGE_CACHE_MODE_WC      ]	= 0         | _PAGE_PCD,
 	[_PAGE_CACHE_MODE_UC_MINUS]	= 0         | _PAGE_PCD,
 	[_PAGE_CACHE_MODE_UC      ]	= _PAGE_PWT | _PAGE_PCD,
 	[_PAGE_CACHE_MODE_WT      ]	= 0         | _PAGE_PCD,
@@ -50,11 +50,11 @@ EXPORT_SYMBOL(__cachemode2pte_tbl);
 
 uint8_t __pte2cachemode_tbl[8] = {
 	[__pte2cm_idx( 0        | 0         | 0        )] = _PAGE_CACHE_MODE_WB,
-	[__pte2cm_idx(_PAGE_PWT | 0         | 0        )] = _PAGE_CACHE_MODE_WC,
+	[__pte2cm_idx(_PAGE_PWT | 0         | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
 	[__pte2cm_idx( 0        | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
 	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC,
 	[__pte2cm_idx( 0        | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
-	[__pte2cm_idx(_PAGE_PWT | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC,
+	[__pte2cm_idx(_PAGE_PWT | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
 	[__pte2cm_idx(0         | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
 	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
 };
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 9ca35fc60cfe..9c0ff045fdd4 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -59,6 +59,7 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
 	unsigned long vaddr;
 	int idx, type;
 
+	preempt_disable();
 	pagefault_disable();
 
 	type = kmap_atomic_idx_push();
@@ -77,13 +78,13 @@ void __iomem *
 iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
 {
 	/*
-	 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
-	 * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
-	 * MTRR is UC or WC.  UC_MINUS gets the real intention, of the
-	 * user, which is "WC if the MTRR is WC, UC if you can't do that."
+	 * For non-PAT systems, translate non-WB request to UC- just in
+	 * case the caller set the PWT bit to prot directly without using
+	 * pgprot_writecombine(). UC- translates to uncached if the MTRR
+	 * is UC or WC. UC- gets the real intention, of the user, which is
+	 * "WC if the MTRR is WC, UC if you can't do that."
 	 */
-	if (!pat_enabled && pgprot_val(prot) ==
-	    (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC)))
+	if (!pat_enabled() && pgprot2cachemode(prot) != _PAGE_CACHE_MODE_WB)
 		prot = __pgprot(__PAGE_KERNEL |
 				cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));
 
@@ -117,5 +118,6 @@ iounmap_atomic(void __iomem *kvaddr)
 	}
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 70e7444c6835..cc5ccc415cc0 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -42,6 +42,9 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
 	case _PAGE_CACHE_MODE_WC:
 		err = _set_memory_wc(vaddr, nrpages);
 		break;
+	case _PAGE_CACHE_MODE_WT:
+		err = _set_memory_wt(vaddr, nrpages);
+		break;
 	case _PAGE_CACHE_MODE_WB:
 		err = _set_memory_wb(vaddr, nrpages);
 		break;
@@ -172,6 +175,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		prot = __pgprot(pgprot_val(prot) |
 				cachemode2protval(_PAGE_CACHE_MODE_WC));
 		break;
+	case _PAGE_CACHE_MODE_WT:
+		prot = __pgprot(pgprot_val(prot) |
+				cachemode2protval(_PAGE_CACHE_MODE_WT));
+		break;
 	case _PAGE_CACHE_MODE_WB:
 		break;
 	}
@@ -234,10 +241,11 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
 {
 	/*
 	 * Ideally, this should be:
-	 *	pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
+	 *	pat_enabled() ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
 	 *
 	 * Till we fix all X drivers to use ioremap_wc(), we will use
-	 * UC MINUS.
+	 * UC MINUS. Drivers that are certain they need or can already
+	 * be converted over to strong UC can use ioremap_uc().
 	 */
 	enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
@@ -247,6 +255,39 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
 EXPORT_SYMBOL(ioremap_nocache);
 
 /**
+ * ioremap_uc     -   map bus memory into CPU space as strongly uncachable
+ * @phys_addr:    bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * ioremap_uc performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked with a strong
+ * preference as completely uncachable on the CPU when possible. For non-PAT
+ * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT
+ * systems this will set the PAT entry for the pages as strong UC.  This call
+ * will honor existing caching rules from things like the PCI bus. Note that
+ * there are other caches and buffers on many busses. In particular driver
+ * authors should read up on PCI writes.
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
+{
+	enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
+
+	return __ioremap_caller(phys_addr, size, pcm,
+				__builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(ioremap_uc);
+
+/**
  * ioremap_wc	-	map memory into CPU space write combined
  * @phys_addr:	bus address of the memory
  * @size:	size of the resource to map
@@ -258,14 +299,28 @@ EXPORT_SYMBOL(ioremap_nocache);
  */
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
-	if (pat_enabled)
-		return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
+	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
 					__builtin_return_address(0));
-	else
-		return ioremap_nocache(phys_addr, size);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
+/**
+ * ioremap_wt	-	map memory into CPU space write through
+ * @phys_addr:	bus address of the memory
+ * @size:	size of the resource to map
+ *
+ * This version of ioremap ensures that the memory is marked write through.
+ * Write through stores data into memory while keeping the cache up-to-date.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
+{
+	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
+					__builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_wt);
+
 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 {
 	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
@@ -331,7 +386,7 @@ void iounmap(volatile void __iomem *addr)
 }
 EXPORT_SYMBOL(iounmap);
 
-int arch_ioremap_pud_supported(void)
+int __init arch_ioremap_pud_supported(void)
 {
 #ifdef CONFIG_X86_64
 	return cpu_has_gbpages;
@@ -340,7 +395,7 @@ int arch_ioremap_pud_supported(void)
 #endif
 }
 
-int arch_ioremap_pmd_supported(void)
+int __init arch_ioremap_pmd_supported(void)
 {
 	return cpu_has_pse;
 }
@@ -353,18 +408,18 @@ void *xlate_dev_mem_ptr(phys_addr_t phys)
 {
 	unsigned long start  = phys &  PAGE_MASK;
 	unsigned long offset = phys & ~PAGE_MASK;
-	unsigned long vaddr;
+	void *vaddr;
 
 	/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
 	if (page_is_ram(start >> PAGE_SHIFT))
 		return __va(phys);
 
-	vaddr = (unsigned long)ioremap_cache(start, PAGE_SIZE);
+	vaddr = ioremap_cache(start, PAGE_SIZE);
 	/* Only add the offset on success and return NULL if the ioremap() failed: */
 	if (vaddr)
 		vaddr += offset;
 
-	return (void *)vaddr;
+	return vaddr;
 }
 
 void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
@@ -373,7 +428,6 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
 		return;
 
 	iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
-	return;
 }
 
 static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index c439ec478216..7a657f58bbea 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -10,13 +10,15 @@
 #include <linux/syscalls.h>
 #include <linux/sched/sysctl.h>
 
-#include <asm/i387.h>
 #include <asm/insn.h>
 #include <asm/mman.h>
 #include <asm/mmu_context.h>
 #include <asm/mpx.h>
 #include <asm/processor.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/internal.h>
+
+#define CREATE_TRACE_POINTS
+#include <asm/trace/mpx.h>
 
 static const char *mpx_mapping_name(struct vm_area_struct *vma)
 {
@@ -32,6 +34,22 @@ static int is_mpx_vma(struct vm_area_struct *vma)
 	return (vma->vm_ops == &mpx_vma_ops);
 }
 
+static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm)
+{
+	if (is_64bit_mm(mm))
+		return MPX_BD_SIZE_BYTES_64;
+	else
+		return MPX_BD_SIZE_BYTES_32;
+}
+
+static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
+{
+	if (is_64bit_mm(mm))
+		return MPX_BT_SIZE_BYTES_64;
+	else
+		return MPX_BT_SIZE_BYTES_32;
+}
+
 /*
  * This is really a simplified "vm_mmap". it only handles MPX
  * bounds tables (the bounds directory is user-allocated).
@@ -47,8 +65,8 @@ static unsigned long mpx_mmap(unsigned long len)
 	vm_flags_t vm_flags;
 	struct vm_area_struct *vma;
 
-	/* Only bounds table and bounds directory can be allocated here */
-	if (len != MPX_BD_SIZE_BYTES && len != MPX_BT_SIZE_BYTES)
+	/* Only bounds table can be allocated here */
+	if (len != mpx_bt_size_bytes(mm))
 		return -EINVAL;
 
 	down_write(&mm->mmap_sem);
@@ -272,10 +290,9 @@ bad_opcode:
  *
  * The caller is expected to kfree() the returned siginfo_t.
  */
-siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
-				struct xsave_struct *xsave_buf)
+siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
 {
-	struct bndreg *bndregs, *bndreg;
+	const struct bndreg *bndregs, *bndreg;
 	siginfo_t *info = NULL;
 	struct insn insn;
 	uint8_t bndregno;
@@ -295,8 +312,8 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
 		err = -EINVAL;
 		goto err_out;
 	}
-	/* get the bndregs _area_ of the xsave structure */
-	bndregs = get_xsave_addr(xsave_buf, XSTATE_BNDREGS);
+	/* get bndregs field from current task's xsave area */
+	bndregs = get_xsave_field_ptr(XSTATE_BNDREGS);
 	if (!bndregs) {
 		err = -EINVAL;
 		goto err_out;
@@ -334,6 +351,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs,
 		err = -EINVAL;
 		goto err_out;
 	}
+	trace_mpx_bounds_register_exception(info->si_addr, bndreg);
 	return info;
 err_out:
 	/* info might be NULL, but kfree() handles that */
@@ -341,25 +359,18 @@ err_out:
 	return ERR_PTR(err);
 }
 
-static __user void *task_get_bounds_dir(struct task_struct *tsk)
+static __user void *mpx_get_bounds_dir(void)
 {
-	struct bndcsr *bndcsr;
+	const struct bndcsr *bndcsr;
 
 	if (!cpu_feature_enabled(X86_FEATURE_MPX))
 		return MPX_INVALID_BOUNDS_DIR;
 
 	/*
-	 * 32-bit binaries on 64-bit kernels are currently
-	 * unsupported.
-	 */
-	if (IS_ENABLED(CONFIG_X86_64) && test_thread_flag(TIF_IA32))
-		return MPX_INVALID_BOUNDS_DIR;
-	/*
 	 * The bounds directory pointer is stored in a register
 	 * only accessible if we first do an xsave.
 	 */
-	fpu_save_init(&tsk->thread.fpu);
-	bndcsr = get_xsave_addr(&tsk->thread.fpu.state->xsave, XSTATE_BNDCSR);
+	bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
 	if (!bndcsr)
 		return MPX_INVALID_BOUNDS_DIR;
 
@@ -378,10 +389,10 @@ static __user void *task_get_bounds_dir(struct task_struct *tsk)
 		(bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK);
 }
 
-int mpx_enable_management(struct task_struct *tsk)
+int mpx_enable_management(void)
 {
 	void __user *bd_base = MPX_INVALID_BOUNDS_DIR;
-	struct mm_struct *mm = tsk->mm;
+	struct mm_struct *mm = current->mm;
 	int ret = 0;
 
 	/*
@@ -390,11 +401,12 @@ int mpx_enable_management(struct task_struct *tsk)
 	 * directory into XSAVE/XRSTOR Save Area and enable MPX through
 	 * XRSTOR instruction.
 	 *
-	 * fpu_xsave() is expected to be very expensive. Storing the bounds
-	 * directory here means that we do not have to do xsave in the unmap
-	 * path; we can just use mm->bd_addr instead.
+	 * The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is
+	 * expected to be relatively expensive. Storing the bounds
+	 * directory here means that we do not have to do xsave in the
+	 * unmap path; we can just use mm->bd_addr instead.
 	 */
-	bd_base = task_get_bounds_dir(tsk);
+	bd_base = mpx_get_bounds_dir();
 	down_write(&mm->mmap_sem);
 	mm->bd_addr = bd_base;
 	if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR)
@@ -404,7 +416,7 @@ int mpx_enable_management(struct task_struct *tsk)
 	return ret;
 }
 
-int mpx_disable_management(struct task_struct *tsk)
+int mpx_disable_management(void)
 {
 	struct mm_struct *mm = current->mm;
 
@@ -417,29 +429,59 @@ int mpx_disable_management(struct task_struct *tsk)
 	return 0;
 }
 
+static int mpx_cmpxchg_bd_entry(struct mm_struct *mm,
+		unsigned long *curval,
+		unsigned long __user *addr,
+		unsigned long old_val, unsigned long new_val)
+{
+	int ret;
+	/*
+	 * user_atomic_cmpxchg_inatomic() actually uses sizeof()
+	 * the pointer that we pass to it to figure out how much
+	 * data to cmpxchg.  We have to be careful here not to
+	 * pass a pointer to a 64-bit data type when we only want
+	 * a 32-bit copy.
+	 */
+	if (is_64bit_mm(mm)) {
+		ret = user_atomic_cmpxchg_inatomic(curval,
+				addr, old_val, new_val);
+	} else {
+		u32 uninitialized_var(curval_32);
+		u32 old_val_32 = old_val;
+		u32 new_val_32 = new_val;
+		u32 __user *addr_32 = (u32 __user *)addr;
+
+		ret = user_atomic_cmpxchg_inatomic(&curval_32,
+				addr_32, old_val_32, new_val_32);
+		*curval = curval_32;
+	}
+	return ret;
+}
+
 /*
- * With 32-bit mode, MPX_BT_SIZE_BYTES is 4MB, and the size of each
- * bounds table is 16KB. With 64-bit mode, MPX_BT_SIZE_BYTES is 2GB,
+ * With 32-bit mode, a bounds directory is 4MB, and the size of each
+ * bounds table is 16KB. With 64-bit mode, a bounds directory is 2GB,
  * and the size of each bounds table is 4MB.
  */
-static int allocate_bt(long __user *bd_entry)
+static int allocate_bt(struct mm_struct *mm, long __user *bd_entry)
 {
 	unsigned long expected_old_val = 0;
 	unsigned long actual_old_val = 0;
 	unsigned long bt_addr;
+	unsigned long bd_new_entry;
 	int ret = 0;
 
 	/*
 	 * Carve the virtual space out of userspace for the new
 	 * bounds table:
 	 */
-	bt_addr = mpx_mmap(MPX_BT_SIZE_BYTES);
+	bt_addr = mpx_mmap(mpx_bt_size_bytes(mm));
 	if (IS_ERR((void *)bt_addr))
 		return PTR_ERR((void *)bt_addr);
 	/*
 	 * Set the valid flag (kinda like _PAGE_PRESENT in a pte)
 	 */
-	bt_addr = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
+	bd_new_entry = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
 
 	/*
 	 * Go poke the address of the new bounds table in to the
@@ -452,8 +494,8 @@ static int allocate_bt(long __user *bd_entry)
 	 * mmap_sem at this point, unlike some of the other part
 	 * of the MPX code that have to pagefault_disable().
 	 */
-	ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry,
-					   expected_old_val, bt_addr);
+	ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val,	bd_entry,
+				   expected_old_val, bd_new_entry);
 	if (ret)
 		goto out_unmap;
 
@@ -481,9 +523,10 @@ static int allocate_bt(long __user *bd_entry)
 		ret = -EINVAL;
 		goto out_unmap;
 	}
+	trace_mpx_new_bounds_table(bt_addr);
 	return 0;
 out_unmap:
-	vm_munmap(bt_addr & MPX_BT_ADDR_MASK, MPX_BT_SIZE_BYTES);
+	vm_munmap(bt_addr, mpx_bt_size_bytes(mm));
 	return ret;
 }
 
@@ -498,12 +541,13 @@ out_unmap:
  * bound table is 16KB. With 64-bit mode, the size of BD is 2GB,
  * and the size of each bound table is 4MB.
  */
-static int do_mpx_bt_fault(struct xsave_struct *xsave_buf)
+static int do_mpx_bt_fault(void)
 {
 	unsigned long bd_entry, bd_base;
-	struct bndcsr *bndcsr;
+	const struct bndcsr *bndcsr;
+	struct mm_struct *mm = current->mm;
 
-	bndcsr = get_xsave_addr(xsave_buf, XSTATE_BNDCSR);
+	bndcsr = get_xsave_field_ptr(XSTATE_BNDCSR);
 	if (!bndcsr)
 		return -EINVAL;
 	/*
@@ -520,13 +564,13 @@ static int do_mpx_bt_fault(struct xsave_struct *xsave_buf)
 	 * the directory is.
 	 */
 	if ((bd_entry < bd_base) ||
-	    (bd_entry >= bd_base + MPX_BD_SIZE_BYTES))
+	    (bd_entry >= bd_base + mpx_bd_size_bytes(mm)))
 		return -EINVAL;
 
-	return allocate_bt((long __user *)bd_entry);
+	return allocate_bt(mm, (long __user *)bd_entry);
 }
 
-int mpx_handle_bd_fault(struct xsave_struct *xsave_buf)
+int mpx_handle_bd_fault(void)
 {
 	/*
 	 * Userspace never asked us to manage the bounds tables,
@@ -535,7 +579,7 @@ int mpx_handle_bd_fault(struct xsave_struct *xsave_buf)
 	if (!kernel_managing_mpx_tables(current->mm))
 		return -EINVAL;
 
-	if (do_mpx_bt_fault(xsave_buf)) {
+	if (do_mpx_bt_fault()) {
 		force_sig(SIGSEGV, current);
 		/*
 		 * The force_sig() is essentially "handling" this
@@ -572,29 +616,55 @@ static int mpx_resolve_fault(long __user *addr, int write)
 	return 0;
 }
 
+static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm,
+					     unsigned long bd_entry)
+{
+	unsigned long bt_addr = bd_entry;
+	int align_to_bytes;
+	/*
+	 * Bit 0 in a bt_entry is always the valid bit.
+	 */
+	bt_addr &= ~MPX_BD_ENTRY_VALID_FLAG;
+	/*
+	 * Tables are naturally aligned at 8-byte boundaries
+	 * on 64-bit and 4-byte boundaries on 32-bit.  The
+	 * documentation makes it appear that the low bits
+	 * are ignored by the hardware, so we do the same.
+	 */
+	if (is_64bit_mm(mm))
+		align_to_bytes = 8;
+	else
+		align_to_bytes = 4;
+	bt_addr &= ~(align_to_bytes-1);
+	return bt_addr;
+}
+
 /*
  * Get the base of bounds tables pointed by specific bounds
  * directory entry.
  */
 static int get_bt_addr(struct mm_struct *mm,
-			long __user *bd_entry, unsigned long *bt_addr)
+			long __user *bd_entry_ptr,
+			unsigned long *bt_addr_result)
 {
 	int ret;
 	int valid_bit;
+	unsigned long bd_entry;
+	unsigned long bt_addr;
 
-	if (!access_ok(VERIFY_READ, (bd_entry), sizeof(*bd_entry)))
+	if (!access_ok(VERIFY_READ, (bd_entry_ptr), sizeof(*bd_entry_ptr)))
 		return -EFAULT;
 
 	while (1) {
 		int need_write = 0;
 
 		pagefault_disable();
-		ret = get_user(*bt_addr, bd_entry);
+		ret = get_user(bd_entry, bd_entry_ptr);
 		pagefault_enable();
 		if (!ret)
 			break;
 		if (ret == -EFAULT)
-			ret = mpx_resolve_fault(bd_entry, need_write);
+			ret = mpx_resolve_fault(bd_entry_ptr, need_write);
 		/*
 		 * If we could not resolve the fault, consider it
 		 * userspace's fault and error out.
@@ -603,8 +673,8 @@ static int get_bt_addr(struct mm_struct *mm,
 			return ret;
 	}
 
-	valid_bit = *bt_addr & MPX_BD_ENTRY_VALID_FLAG;
-	*bt_addr &= MPX_BT_ADDR_MASK;
+	valid_bit = bd_entry & MPX_BD_ENTRY_VALID_FLAG;
+	bt_addr = mpx_bd_entry_to_bt_addr(mm, bd_entry);
 
 	/*
 	 * When the kernel is managing bounds tables, a bounds directory
@@ -613,7 +683,7 @@ static int get_bt_addr(struct mm_struct *mm,
 	 * data in the address field, we know something is wrong. This
 	 * -EINVAL return will cause a SIGSEGV.
 	 */
-	if (!valid_bit && *bt_addr)
+	if (!valid_bit && bt_addr)
 		return -EINVAL;
 	/*
 	 * Do we have an completely zeroed bt entry?  That is OK.  It
@@ -624,19 +694,100 @@ static int get_bt_addr(struct mm_struct *mm,
 	if (!valid_bit)
 		return -ENOENT;
 
+	*bt_addr_result = bt_addr;
 	return 0;
 }
 
+static inline int bt_entry_size_bytes(struct mm_struct *mm)
+{
+	if (is_64bit_mm(mm))
+		return MPX_BT_ENTRY_BYTES_64;
+	else
+		return MPX_BT_ENTRY_BYTES_32;
+}
+
+/*
+ * Take a virtual address and turns it in to the offset in bytes
+ * inside of the bounds table where the bounds table entry
+ * controlling 'addr' can be found.
+ */
+static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm,
+		unsigned long addr)
+{
+	unsigned long bt_table_nr_entries;
+	unsigned long offset = addr;
+
+	if (is_64bit_mm(mm)) {
+		/* Bottom 3 bits are ignored on 64-bit */
+		offset >>= 3;
+		bt_table_nr_entries = MPX_BT_NR_ENTRIES_64;
+	} else {
+		/* Bottom 2 bits are ignored on 32-bit */
+		offset >>= 2;
+		bt_table_nr_entries = MPX_BT_NR_ENTRIES_32;
+	}
+	/*
+	 * We know the size of the table in to which we are
+	 * indexing, and we have eliminated all the low bits
+	 * which are ignored for indexing.
+	 *
+	 * Mask out all the high bits which we do not need
+	 * to index in to the table.  Note that the tables
+	 * are always powers of two so this gives us a proper
+	 * mask.
+	 */
+	offset &= (bt_table_nr_entries-1);
+	/*
+	 * We now have an entry offset in terms of *entries* in
+	 * the table.  We need to scale it back up to bytes.
+	 */
+	offset *= bt_entry_size_bytes(mm);
+	return offset;
+}
+
+/*
+ * How much virtual address space does a single bounds
+ * directory entry cover?
+ *
+ * Note, we need a long long because 4GB doesn't fit in
+ * to a long on 32-bit.
+ */
+static inline unsigned long bd_entry_virt_space(struct mm_struct *mm)
+{
+	unsigned long long virt_space = (1ULL << boot_cpu_data.x86_virt_bits);
+	if (is_64bit_mm(mm))
+		return virt_space / MPX_BD_NR_ENTRIES_64;
+	else
+		return virt_space / MPX_BD_NR_ENTRIES_32;
+}
+
 /*
  * Free the backing physical pages of bounds table 'bt_addr'.
  * Assume start...end is within that bounds table.
  */
-static int zap_bt_entries(struct mm_struct *mm,
+static noinline int zap_bt_entries_mapping(struct mm_struct *mm,
 		unsigned long bt_addr,
-		unsigned long start, unsigned long end)
+		unsigned long start_mapping, unsigned long end_mapping)
 {
 	struct vm_area_struct *vma;
 	unsigned long addr, len;
+	unsigned long start;
+	unsigned long end;
+
+	/*
+	 * if we 'end' on a boundary, the offset will be 0 which
+	 * is not what we want.  Back it up a byte to get the
+	 * last bt entry.  Then once we have the entry itself,
+	 * move 'end' back up by the table entry size.
+	 */
+	start = bt_addr + mpx_get_bt_entry_offset_bytes(mm, start_mapping);
+	end   = bt_addr + mpx_get_bt_entry_offset_bytes(mm, end_mapping - 1);
+	/*
+	 * Move end back up by one entry.  Among other things
+	 * this ensures that it remains page-aligned and does
+	 * not screw up zap_page_range()
+	 */
+	end += bt_entry_size_bytes(mm);
 
 	/*
 	 * Find the first overlapping vma. If vma->vm_start > start, there
@@ -648,7 +799,7 @@ static int zap_bt_entries(struct mm_struct *mm,
 		return -EINVAL;
 
 	/*
-	 * A NUMA policy on a VM_MPX VMA could cause this bouds table to
+	 * A NUMA policy on a VM_MPX VMA could cause this bounds table to
 	 * be split. So we need to look across the entire 'start -> end'
 	 * range of this bounds table, find all of the VM_MPX VMAs, and
 	 * zap only those.
@@ -666,27 +817,65 @@ static int zap_bt_entries(struct mm_struct *mm,
 
 		len = min(vma->vm_end, end) - addr;
 		zap_page_range(vma, addr, len, NULL);
+		trace_mpx_unmap_zap(addr, addr+len);
 
 		vma = vma->vm_next;
 		addr = vma->vm_start;
 	}
-
 	return 0;
 }
 
-static int unmap_single_bt(struct mm_struct *mm,
+static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm,
+		unsigned long addr)
+{
+	/*
+	 * There are several ways to derive the bd offsets.  We
+	 * use the following approach here:
+	 * 1. We know the size of the virtual address space
+	 * 2. We know the number of entries in a bounds table
+	 * 3. We know that each entry covers a fixed amount of
+	 *    virtual address space.
+	 * So, we can just divide the virtual address by the
+	 * virtual space used by one entry to determine which
+	 * entry "controls" the given virtual address.
+	 */
+	if (is_64bit_mm(mm)) {
+		int bd_entry_size = 8; /* 64-bit pointer */
+		/*
+		 * Take the 64-bit addressing hole in to account.
+		 */
+		addr &= ((1UL << boot_cpu_data.x86_virt_bits) - 1);
+		return (addr / bd_entry_virt_space(mm)) * bd_entry_size;
+	} else {
+		int bd_entry_size = 4; /* 32-bit pointer */
+		/*
+		 * 32-bit has no hole so this case needs no mask
+		 */
+		return (addr / bd_entry_virt_space(mm)) * bd_entry_size;
+	}
+	/*
+	 * The two return calls above are exact copies.  If we
+	 * pull out a single copy and put it in here, gcc won't
+	 * realize that we're doing a power-of-2 divide and use
+	 * shifts.  It uses a real divide.  If we put them up
+	 * there, it manages to figure it out (gcc 4.8.3).
+	 */
+}
+
+static int unmap_entire_bt(struct mm_struct *mm,
 		long __user *bd_entry, unsigned long bt_addr)
 {
 	unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
-	unsigned long actual_old_val = 0;
+	unsigned long uninitialized_var(actual_old_val);
 	int ret;
 
 	while (1) {
 		int need_write = 1;
+		unsigned long cleared_bd_entry = 0;
 
 		pagefault_disable();
-		ret = user_atomic_cmpxchg_inatomic(&actual_old_val, bd_entry,
-						   expected_old_val, 0);
+		ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val,
+				bd_entry, expected_old_val, cleared_bd_entry);
 		pagefault_enable();
 		if (!ret)
 			break;
@@ -705,9 +894,8 @@ static int unmap_single_bt(struct mm_struct *mm,
 	if (actual_old_val != expected_old_val) {
 		/*
 		 * Someone else raced with us to unmap the table.
-		 * There was no bounds table pointed to by the
-		 * directory, so declare success.  Somebody freed
-		 * it.
+		 * That is OK, since we were both trying to do
+		 * the same thing.  Declare success.
 		 */
 		if (!actual_old_val)
 			return 0;
@@ -720,176 +908,113 @@ static int unmap_single_bt(struct mm_struct *mm,
 		 */
 		return -EINVAL;
 	}
-
 	/*
 	 * Note, we are likely being called under do_munmap() already. To
 	 * avoid recursion, do_munmap() will check whether it comes
 	 * from one bounds table through VM_MPX flag.
 	 */
-	return do_munmap(mm, bt_addr, MPX_BT_SIZE_BYTES);
+	return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm));
 }
 
-/*
- * If the bounds table pointed by bounds directory 'bd_entry' is
- * not shared, unmap this whole bounds table. Otherwise, only free
- * those backing physical pages of bounds table entries covered
- * in this virtual address region start...end.
- */
-static int unmap_shared_bt(struct mm_struct *mm,
-		long __user *bd_entry, unsigned long start,
-		unsigned long end, bool prev_shared, bool next_shared)
+static int try_unmap_single_bt(struct mm_struct *mm,
+	       unsigned long start, unsigned long end)
 {
-	unsigned long bt_addr;
-	int ret;
-
-	ret = get_bt_addr(mm, bd_entry, &bt_addr);
+	struct vm_area_struct *next;
+	struct vm_area_struct *prev;
 	/*
-	 * We could see an "error" ret for not-present bounds
-	 * tables (not really an error), or actual errors, but
-	 * stop unmapping either way.
+	 * "bta" == Bounds Table Area: the area controlled by the
+	 * bounds table that we are unmapping.
 	 */
-	if (ret)
-		return ret;
-
-	if (prev_shared && next_shared)
-		ret = zap_bt_entries(mm, bt_addr,
-				bt_addr+MPX_GET_BT_ENTRY_OFFSET(start),
-				bt_addr+MPX_GET_BT_ENTRY_OFFSET(end));
-	else if (prev_shared)
-		ret = zap_bt_entries(mm, bt_addr,
-				bt_addr+MPX_GET_BT_ENTRY_OFFSET(start),
-				bt_addr+MPX_BT_SIZE_BYTES);
-	else if (next_shared)
-		ret = zap_bt_entries(mm, bt_addr, bt_addr,
-				bt_addr+MPX_GET_BT_ENTRY_OFFSET(end));
-	else
-		ret = unmap_single_bt(mm, bd_entry, bt_addr);
-
-	return ret;
-}
-
-/*
- * A virtual address region being munmap()ed might share bounds table
- * with adjacent VMAs. We only need to free the backing physical
- * memory of these shared bounds tables entries covered in this virtual
- * address region.
- */
-static int unmap_edge_bts(struct mm_struct *mm,
-		unsigned long start, unsigned long end)
-{
+	unsigned long bta_start_vaddr = start & ~(bd_entry_virt_space(mm)-1);
+	unsigned long bta_end_vaddr = bta_start_vaddr + bd_entry_virt_space(mm);
+	unsigned long uninitialized_var(bt_addr);
+	void __user *bde_vaddr;
 	int ret;
-	long __user *bde_start, *bde_end;
-	struct vm_area_struct *prev, *next;
-	bool prev_shared = false, next_shared = false;
-
-	bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start);
-	bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1);
-
 	/*
-	 * Check whether bde_start and bde_end are shared with adjacent
-	 * VMAs.
-	 *
-	 * We already unliked the VMAs from the mm's rbtree so 'start'
+	 * We already unlinked the VMAs from the mm's rbtree so 'start'
 	 * is guaranteed to be in a hole. This gets us the first VMA
 	 * before the hole in to 'prev' and the next VMA after the hole
 	 * in to 'next'.
 	 */
 	next = find_vma_prev(mm, start, &prev);
-	if (prev && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(prev->vm_end-1))
-			== bde_start)
-		prev_shared = true;
-	if (next && (mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(next->vm_start))
-			== bde_end)
-		next_shared = true;
-
 	/*
-	 * This virtual address region being munmap()ed is only
-	 * covered by one bounds table.
-	 *
-	 * In this case, if this table is also shared with adjacent
-	 * VMAs, only part of the backing physical memory of the bounds
-	 * table need be freeed. Otherwise the whole bounds table need
-	 * be unmapped.
-	 */
-	if (bde_start == bde_end) {
-		return unmap_shared_bt(mm, bde_start, start, end,
-				prev_shared, next_shared);
+	 * Do not count other MPX bounds table VMAs as neighbors.
+	 * Although theoretically possible, we do not allow bounds
+	 * tables for bounds tables so our heads do not explode.
+	 * If we count them as neighbors here, we may end up with
+	 * lots of tables even though we have no actual table
+	 * entries in use.
+	 */
+	while (next && is_mpx_vma(next))
+		next = next->vm_next;
+	while (prev && is_mpx_vma(prev))
+		prev = prev->vm_prev;
+	/*
+	 * We know 'start' and 'end' lie within an area controlled
+	 * by a single bounds table.  See if there are any other
+	 * VMAs controlled by that bounds table.  If there are not
+	 * then we can "expand" the are we are unmapping to possibly
+	 * cover the entire table.
+	 */
+	next = find_vma_prev(mm, start, &prev);
+	if ((!prev || prev->vm_end <= bta_start_vaddr) &&
+	    (!next || next->vm_start >= bta_end_vaddr)) {
+		/*
+		 * No neighbor VMAs controlled by same bounds
+		 * table.  Try to unmap the whole thing
+		 */
+		start = bta_start_vaddr;
+		end = bta_end_vaddr;
 	}
 
+	bde_vaddr = mm->bd_addr + mpx_get_bd_entry_offset(mm, start);
+	ret = get_bt_addr(mm, bde_vaddr, &bt_addr);
 	/*
-	 * If more than one bounds tables are covered in this virtual
-	 * address region being munmap()ed, we need to separately check
-	 * whether bde_start and bde_end are shared with adjacent VMAs.
+	 * No bounds table there, so nothing to unmap.
 	 */
-	ret = unmap_shared_bt(mm, bde_start, start, end, prev_shared, false);
-	if (ret)
-		return ret;
-	ret = unmap_shared_bt(mm, bde_end, start, end, false, next_shared);
+	if (ret == -ENOENT) {
+		ret = 0;
+		return 0;
+	}
 	if (ret)
 		return ret;
-
-	return 0;
+	/*
+	 * We are unmapping an entire table.  Either because the
+	 * unmap that started this whole process was large enough
+	 * to cover an entire table, or that the unmap was small
+	 * but was the area covered by a bounds table.
+	 */
+	if ((start == bta_start_vaddr) &&
+	    (end == bta_end_vaddr))
+		return unmap_entire_bt(mm, bde_vaddr, bt_addr);
+	return zap_bt_entries_mapping(mm, bt_addr, start, end);
 }
 
 static int mpx_unmap_tables(struct mm_struct *mm,
 		unsigned long start, unsigned long end)
 {
-	int ret;
-	long __user *bd_entry, *bde_start, *bde_end;
-	unsigned long bt_addr;
-
-	/*
-	 * "Edge" bounds tables are those which are being used by the region
-	 * (start -> end), but that may be shared with adjacent areas.  If they
-	 * turn out to be completely unshared, they will be freed.  If they are
-	 * shared, we will free the backing store (like an MADV_DONTNEED) for
-	 * areas used by this region.
-	 */
-	ret = unmap_edge_bts(mm, start, end);
-	switch (ret) {
-		/* non-present tables are OK */
-		case 0:
-		case -ENOENT:
-			/* Success, or no tables to unmap */
-			break;
-		case -EINVAL:
-		case -EFAULT:
-		default:
-			return ret;
-	}
-
-	/*
-	 * Only unmap the bounds table that are
-	 *   1. fully covered
-	 *   2. not at the edges of the mapping, even if full aligned
-	 */
-	bde_start = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(start);
-	bde_end = mm->bd_addr + MPX_GET_BD_ENTRY_OFFSET(end-1);
-	for (bd_entry = bde_start + 1; bd_entry < bde_end; bd_entry++) {
-		ret = get_bt_addr(mm, bd_entry, &bt_addr);
-		switch (ret) {
-			case 0:
-				break;
-			case -ENOENT:
-				/* No table here, try the next one */
-				continue;
-			case -EINVAL:
-			case -EFAULT:
-			default:
-				/*
-				 * Note: we are being strict here.
-				 * Any time we run in to an issue
-				 * unmapping tables, we stop and
-				 * SIGSEGV.
-				 */
-				return ret;
-		}
-
-		ret = unmap_single_bt(mm, bd_entry, bt_addr);
+	unsigned long one_unmap_start;
+	trace_mpx_unmap_search(start, end);
+
+	one_unmap_start = start;
+	while (one_unmap_start < end) {
+		int ret;
+		unsigned long next_unmap_start = ALIGN(one_unmap_start+1,
+						       bd_entry_virt_space(mm));
+		unsigned long one_unmap_end = end;
+		/*
+		 * if the end is beyond the current bounds table,
+		 * move it back so we only deal with a single one
+		 * at a time
+		 */
+		if (one_unmap_end > next_unmap_start)
+			one_unmap_end = next_unmap_start;
+		ret = try_unmap_single_bt(mm, one_unmap_start, one_unmap_end);
 		if (ret)
 			return ret;
-	}
 
+		one_unmap_start = next_unmap_start;
+	}
 	return 0;
 }
 
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 6629f397b467..8ff686aa7e8c 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -9,6 +9,7 @@
 #include <linux/random.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/vmalloc.h>
 
 #include <asm/cacheflush.h>
 #include <asm/pgtable.h>
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 89af288ec674..727158cb3b3c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -14,6 +14,7 @@
 #include <linux/percpu.h>
 #include <linux/gfp.h>
 #include <linux/pci.h>
+#include <linux/vmalloc.h>
 
 #include <asm/e820.h>
 #include <asm/processor.h>
@@ -129,16 +130,15 @@ within(unsigned long addr, unsigned long start, unsigned long end)
  */
 void clflush_cache_range(void *vaddr, unsigned int size)
 {
-	void *vend = vaddr + size - 1;
+	unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+	void *vend = vaddr + size;
+	void *p;
 
 	mb();
 
-	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
-		clflushopt(vaddr);
-	/*
-	 * Flush any possible final partial cacheline:
-	 */
-	clflushopt(vend);
+	for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+	     p < vend; p += boot_cpu_data.x86_clflush_size)
+		clflushopt(p);
 
 	mb();
 }
@@ -418,13 +418,11 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr)
 	phys_addr_t phys_addr;
 	unsigned long offset;
 	enum pg_level level;
-	unsigned long psize;
 	unsigned long pmask;
 	pte_t *pte;
 
 	pte = lookup_address(virt_addr, &level);
 	BUG_ON(!pte);
-	psize = page_level_size(level);
 	pmask = page_level_mask(level);
 	offset = virt_addr & ~pmask;
 	phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
@@ -1468,6 +1466,9 @@ int _set_memory_uc(unsigned long addr, int numpages)
 {
 	/*
 	 * for now UC MINUS. see comments in ioremap_nocache()
+	 * If you really need strong UC use ioremap_uc(), but note
+	 * that you cannot override IO areas with set_memory_*() as
+	 * these helpers cannot work with IO memory.
 	 */
 	return change_page_attr_set(&addr, numpages,
 				    cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
@@ -1502,12 +1503,10 @@ EXPORT_SYMBOL(set_memory_uc);
 static int _set_memory_array(unsigned long *addr, int addrinarray,
 		enum page_cache_mode new_type)
 {
+	enum page_cache_mode set_type;
 	int i, j;
 	int ret;
 
-	/*
-	 * for now UC MINUS. see comments in ioremap_nocache()
-	 */
 	for (i = 0; i < addrinarray; i++) {
 		ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
 					new_type, NULL);
@@ -1515,9 +1514,12 @@ static int _set_memory_array(unsigned long *addr, int addrinarray,
 			goto out_free;
 	}
 
+	/* If WC, set to UC- first and then WC */
+	set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
+				_PAGE_CACHE_MODE_UC_MINUS : new_type;
+
 	ret = change_page_attr_set(addr, addrinarray,
-				   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
-				   1);
+				   cachemode2pgprot(set_type), 1);
 
 	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
 		ret = change_page_attr_set_clr(addr, addrinarray,
@@ -1549,6 +1551,12 @@ int set_memory_array_wc(unsigned long *addr, int addrinarray)
 }
 EXPORT_SYMBOL(set_memory_array_wc);
 
+int set_memory_array_wt(unsigned long *addr, int addrinarray)
+{
+	return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT);
+}
+EXPORT_SYMBOL_GPL(set_memory_array_wt);
+
 int _set_memory_wc(unsigned long addr, int numpages)
 {
 	int ret;
@@ -1571,27 +1579,42 @@ int set_memory_wc(unsigned long addr, int numpages)
 {
 	int ret;
 
-	if (!pat_enabled)
-		return set_memory_uc(addr, numpages);
-
 	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
 		_PAGE_CACHE_MODE_WC, NULL);
 	if (ret)
-		goto out_err;
+		return ret;
 
 	ret = _set_memory_wc(addr, numpages);
 	if (ret)
-		goto out_free;
-
-	return 0;
+		free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
 
-out_free:
-	free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
-out_err:
 	return ret;
 }
 EXPORT_SYMBOL(set_memory_wc);
 
+int _set_memory_wt(unsigned long addr, int numpages)
+{
+	return change_page_attr_set(&addr, numpages,
+				    cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
+}
+
+int set_memory_wt(unsigned long addr, int numpages)
+{
+	int ret;
+
+	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+			      _PAGE_CACHE_MODE_WT, NULL);
+	if (ret)
+		return ret;
+
+	ret = _set_memory_wt(addr, numpages);
+	if (ret)
+		free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(set_memory_wt);
+
 int _set_memory_wb(unsigned long addr, int numpages)
 {
 	/* WB cache mode is hard wired to all cache attribute bits being 0 */
@@ -1682,6 +1705,7 @@ static int _set_pages_array(struct page **pages, int addrinarray,
 {
 	unsigned long start;
 	unsigned long end;
+	enum page_cache_mode set_type;
 	int i;
 	int free_idx;
 	int ret;
@@ -1695,8 +1719,12 @@ static int _set_pages_array(struct page **pages, int addrinarray,
 			goto err_out;
 	}
 
+	/* If WC, set to UC- first and then WC */
+	set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
+				_PAGE_CACHE_MODE_UC_MINUS : new_type;
+
 	ret = cpa_set_pages_array(pages, addrinarray,
-			cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS));
+				  cachemode2pgprot(set_type));
 	if (!ret && new_type == _PAGE_CACHE_MODE_WC)
 		ret = change_page_attr_set_clr(NULL, addrinarray,
 					       cachemode2pgprot(
@@ -1730,6 +1758,12 @@ int set_pages_array_wc(struct page **pages, int addrinarray)
 }
 EXPORT_SYMBOL(set_pages_array_wc);
 
+int set_pages_array_wt(struct page **pages, int addrinarray)
+{
+	return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT);
+}
+EXPORT_SYMBOL_GPL(set_pages_array_wt);
+
 int set_pages_wb(struct page *page, int numpages)
 {
 	unsigned long addr = (unsigned long)page_address(page);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 35af6771a95a..188e3e07eeeb 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -33,13 +33,17 @@
 #include "pat_internal.h"
 #include "mm_internal.h"
 
-#ifdef CONFIG_X86_PAT
-int __read_mostly pat_enabled = 1;
+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
+
+static bool boot_cpu_done;
+
+static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT);
 
 static inline void pat_disable(const char *reason)
 {
-	pat_enabled = 0;
-	printk(KERN_INFO "%s\n", reason);
+	__pat_enabled = 0;
+	pr_info("x86/PAT: %s\n", reason);
 }
 
 static int __init nopat(char *str)
@@ -48,13 +52,12 @@ static int __init nopat(char *str)
 	return 0;
 }
 early_param("nopat", nopat);
-#else
-static inline void pat_disable(const char *reason)
+
+bool pat_enabled(void)
 {
-	(void)reason;
+	return !!__pat_enabled;
 }
-#endif
-
+EXPORT_SYMBOL_GPL(pat_enabled);
 
 int pat_debug_enable;
 
@@ -65,22 +68,24 @@ static int __init pat_debug_setup(char *str)
 }
 __setup("debugpat", pat_debug_setup);
 
-static u64 __read_mostly boot_pat_state;
-
 #ifdef CONFIG_X86_PAT
 /*
- * X86 PAT uses page flags WC and Uncached together to keep track of
- * memory type of pages that have backing page struct. X86 PAT supports 3
- * different memory types, _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC and
- * _PAGE_CACHE_MODE_UC_MINUS and fourth state where page's memory type has not
- * been changed from its default (value of -1 used to denote this).
- * Note we do not support _PAGE_CACHE_MODE_UC here.
+ * X86 PAT uses page flags arch_1 and uncached together to keep track of
+ * memory type of pages that have backing page struct.
+ *
+ * X86 PAT supports 4 different memory types:
+ *  - _PAGE_CACHE_MODE_WB
+ *  - _PAGE_CACHE_MODE_WC
+ *  - _PAGE_CACHE_MODE_UC_MINUS
+ *  - _PAGE_CACHE_MODE_WT
+ *
+ * _PAGE_CACHE_MODE_WB is the default type.
  */
 
-#define _PGMT_DEFAULT		0
+#define _PGMT_WB		0
 #define _PGMT_WC		(1UL << PG_arch_1)
 #define _PGMT_UC_MINUS		(1UL << PG_uncached)
-#define _PGMT_WB		(1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_WT		(1UL << PG_uncached | 1UL << PG_arch_1)
 #define _PGMT_MASK		(1UL << PG_uncached | 1UL << PG_arch_1)
 #define _PGMT_CLEAR_MASK	(~_PGMT_MASK)
 
@@ -88,14 +93,14 @@ static inline enum page_cache_mode get_page_memtype(struct page *pg)
 {
 	unsigned long pg_flags = pg->flags & _PGMT_MASK;
 
-	if (pg_flags == _PGMT_DEFAULT)
-		return -1;
+	if (pg_flags == _PGMT_WB)
+		return _PAGE_CACHE_MODE_WB;
 	else if (pg_flags == _PGMT_WC)
 		return _PAGE_CACHE_MODE_WC;
 	else if (pg_flags == _PGMT_UC_MINUS)
 		return _PAGE_CACHE_MODE_UC_MINUS;
 	else
-		return _PAGE_CACHE_MODE_WB;
+		return _PAGE_CACHE_MODE_WT;
 }
 
 static inline void set_page_memtype(struct page *pg,
@@ -112,11 +117,12 @@ static inline void set_page_memtype(struct page *pg,
 	case _PAGE_CACHE_MODE_UC_MINUS:
 		memtype_flags = _PGMT_UC_MINUS;
 		break;
-	case _PAGE_CACHE_MODE_WB:
-		memtype_flags = _PGMT_WB;
+	case _PAGE_CACHE_MODE_WT:
+		memtype_flags = _PGMT_WT;
 		break;
+	case _PAGE_CACHE_MODE_WB:
 	default:
-		memtype_flags = _PGMT_DEFAULT;
+		memtype_flags = _PGMT_WB;
 		break;
 	}
 
@@ -174,78 +180,154 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
  * configuration.
  * Using lower indices is preferred, so we start with highest index.
  */
-void pat_init_cache_modes(void)
+void pat_init_cache_modes(u64 pat)
 {
-	int i;
 	enum page_cache_mode cache;
 	char pat_msg[33];
-	u64 pat;
+	int i;
 
-	rdmsrl(MSR_IA32_CR_PAT, pat);
 	pat_msg[32] = 0;
 	for (i = 7; i >= 0; i--) {
 		cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
 					   pat_msg + 4 * i);
 		update_cache_mode_entry(i, cache);
 	}
-	pr_info("PAT configuration [0-7]: %s\n", pat_msg);
+	pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
 }
 
 #define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))
 
-void pat_init(void)
+static void pat_bsp_init(u64 pat)
 {
-	u64 pat;
-	bool boot_cpu = !boot_pat_state;
+	u64 tmp_pat;
 
-	if (!pat_enabled)
+	if (!cpu_has_pat) {
+		pat_disable("PAT not supported by CPU.");
 		return;
+	}
 
-	if (!cpu_has_pat) {
-		if (!boot_pat_state) {
-			pat_disable("PAT not supported by CPU.");
-			return;
-		} else {
-			/*
-			 * If this happens we are on a secondary CPU, but
-			 * switched to PAT on the boot CPU. We have no way to
-			 * undo PAT.
-			 */
-			printk(KERN_ERR "PAT enabled, "
-			       "but not supported by secondary CPU\n");
-			BUG();
-		}
+	if (!pat_enabled())
+		goto done;
+
+	rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
+	if (!tmp_pat) {
+		pat_disable("PAT MSR is 0, disabled.");
+		return;
 	}
 
-	/* Set PWT to Write-Combining. All other bits stay the same */
-	/*
-	 * PTE encoding used in Linux:
-	 *      PAT
-	 *      |PCD
-	 *      ||PWT
-	 *      |||
-	 *      000 WB		_PAGE_CACHE_WB
-	 *      001 WC		_PAGE_CACHE_WC
-	 *      010 UC-		_PAGE_CACHE_UC_MINUS
-	 *      011 UC		_PAGE_CACHE_UC
-	 * PAT bit unused
-	 */
-	pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
-	      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
-
-	/* Boot CPU check */
-	if (!boot_pat_state) {
-		rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
-		if (!boot_pat_state) {
-			pat_disable("PAT read returns always zero, disabled.");
-			return;
-		}
+	wrmsrl(MSR_IA32_CR_PAT, pat);
+
+done:
+	pat_init_cache_modes(pat);
+}
+
+static void pat_ap_init(u64 pat)
+{
+	if (!pat_enabled())
+		return;
+
+	if (!cpu_has_pat) {
+		/*
+		 * If this happens we are on a secondary CPU, but switched to
+		 * PAT on the boot CPU. We have no way to undo PAT.
+		 */
+		panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
 	}
 
 	wrmsrl(MSR_IA32_CR_PAT, pat);
+}
+
+void pat_init(void)
+{
+	u64 pat;
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	if (!pat_enabled()) {
+		/*
+		 * No PAT. Emulate the PAT table that corresponds to the two
+		 * cache bits, PWT (Write Through) and PCD (Cache Disable). This
+		 * setup is the same as the BIOS default setup when the system
+		 * has PAT but the "nopat" boot option has been specified. This
+		 * emulated PAT table is used when MSR_IA32_CR_PAT returns 0.
+		 *
+		 * PTE encoding:
+		 *
+		 *       PCD
+		 *       |PWT  PAT
+		 *       ||    slot
+		 *       00    0    WB : _PAGE_CACHE_MODE_WB
+		 *       01    1    WT : _PAGE_CACHE_MODE_WT
+		 *       10    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
+		 *       11    3    UC : _PAGE_CACHE_MODE_UC
+		 *
+		 * NOTE: When WC or WP is used, it is redirected to UC- per
+		 * the default setup in __cachemode2pte_tbl[].
+		 */
+		pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
+		      PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
 
-	if (boot_cpu)
-		pat_init_cache_modes();
+	} else if ((c->x86_vendor == X86_VENDOR_INTEL) &&
+		   (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
+		    ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
+		/*
+		 * PAT support with the lower four entries. Intel Pentium 2,
+		 * 3, M, and 4 are affected by PAT errata, which makes the
+		 * upper four entries unusable. To be on the safe side, we don't
+		 * use those.
+		 *
+		 *  PTE encoding:
+		 *      PAT
+		 *      |PCD
+		 *      ||PWT  PAT
+		 *      |||    slot
+		 *      000    0    WB : _PAGE_CACHE_MODE_WB
+		 *      001    1    WC : _PAGE_CACHE_MODE_WC
+		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
+		 *      011    3    UC : _PAGE_CACHE_MODE_UC
+		 * PAT bit unused
+		 *
+		 * NOTE: When WT or WP is used, it is redirected to UC- per
+		 * the default setup in __cachemode2pte_tbl[].
+		 */
+		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+		      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
+	} else {
+		/*
+		 * Full PAT support.  We put WT in slot 7 to improve
+		 * robustness in the presence of errata that might cause
+		 * the high PAT bit to be ignored.  This way, a buggy slot 7
+		 * access will hit slot 3, and slot 3 is UC, so at worst
+		 * we lose performance without causing a correctness issue.
+		 * Pentium 4 erratum N46 is an example for such an erratum,
+		 * although we try not to use PAT at all on affected CPUs.
+		 *
+		 *  PTE encoding:
+		 *      PAT
+		 *      |PCD
+		 *      ||PWT  PAT
+		 *      |||    slot
+		 *      000    0    WB : _PAGE_CACHE_MODE_WB
+		 *      001    1    WC : _PAGE_CACHE_MODE_WC
+		 *      010    2    UC-: _PAGE_CACHE_MODE_UC_MINUS
+		 *      011    3    UC : _PAGE_CACHE_MODE_UC
+		 *      100    4    WB : Reserved
+		 *      101    5    WC : Reserved
+		 *      110    6    UC-: Reserved
+		 *      111    7    WT : _PAGE_CACHE_MODE_WT
+		 *
+		 * The reserved slots are unused, but mapped to their
+		 * corresponding types in the presence of PAT errata.
+		 */
+		pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+		      PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT);
+	}
+
+	if (!boot_cpu_done) {
+		pat_bsp_init(pat);
+		boot_cpu_done = true;
+	} else {
+		pat_ap_init(pat);
+	}
 }
 
 #undef PAT
@@ -267,9 +349,9 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end,
 	 * request is for WB.
 	 */
 	if (req_type == _PAGE_CACHE_MODE_WB) {
-		u8 mtrr_type;
+		u8 mtrr_type, uniform;
 
-		mtrr_type = mtrr_type_lookup(start, end);
+		mtrr_type = mtrr_type_lookup(start, end, &uniform);
 		if (mtrr_type != MTRR_TYPE_WRBACK)
 			return _PAGE_CACHE_MODE_UC_MINUS;
 
@@ -324,9 +406,14 @@ static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
 
 /*
  * For RAM pages, we use page flags to mark the pages with appropriate type.
- * Here we do two pass:
- * - Find the memtype of all the pages in the range, look for any conflicts
- * - In case of no conflicts, set the new memtype for pages in the range
+ * The page flags are limited to four types, WB (default), WC, WT and UC-.
+ * WP request fails with -EINVAL, and UC gets redirected to UC-.  Setting
+ * a new memory type is only allowed for a page mapped with the default WB
+ * type.
+ *
+ * Here we do two passes:
+ * - Find the memtype of all the pages in the range, look for any conflicts.
+ * - In case of no conflicts, set the new memtype for pages in the range.
  */
 static int reserve_ram_pages_type(u64 start, u64 end,
 				  enum page_cache_mode req_type,
@@ -335,6 +422,12 @@ static int reserve_ram_pages_type(u64 start, u64 end,
 	struct page *page;
 	u64 pfn;
 
+	if (req_type == _PAGE_CACHE_MODE_WP) {
+		if (new_type)
+			*new_type = _PAGE_CACHE_MODE_UC_MINUS;
+		return -EINVAL;
+	}
+
 	if (req_type == _PAGE_CACHE_MODE_UC) {
 		/* We do not support strong UC */
 		WARN_ON_ONCE(1);
@@ -346,8 +439,8 @@ static int reserve_ram_pages_type(u64 start, u64 end,
 
 		page = pfn_to_page(pfn);
 		type = get_page_memtype(page);
-		if (type != -1) {
-			pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
+		if (type != _PAGE_CACHE_MODE_WB) {
+			pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
 				start, end - 1, type, req_type);
 			if (new_type)
 				*new_type = type;
@@ -373,7 +466,7 @@ static int free_ram_pages_type(u64 start, u64 end)
 
 	for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
 		page = pfn_to_page(pfn);
-		set_page_memtype(page, -1);
+		set_page_memtype(page, _PAGE_CACHE_MODE_WB);
 	}
 	return 0;
 }
@@ -384,6 +477,7 @@ static int free_ram_pages_type(u64 start, u64 end)
  * - _PAGE_CACHE_MODE_WC
  * - _PAGE_CACHE_MODE_UC_MINUS
  * - _PAGE_CACHE_MODE_UC
+ * - _PAGE_CACHE_MODE_WT
  *
  * If new_type is NULL, function will return an error if it cannot reserve the
  * region with req_type. If new_type is non-NULL, function will return
@@ -400,14 +494,10 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
 
 	BUG_ON(start >= end); /* end is exclusive */
 
-	if (!pat_enabled) {
+	if (!pat_enabled()) {
 		/* This is identical to page table setting without PAT */
-		if (new_type) {
-			if (req_type == _PAGE_CACHE_MODE_WC)
-				*new_type = _PAGE_CACHE_MODE_UC_MINUS;
-			else
-				*new_type = req_type;
-		}
+		if (new_type)
+			*new_type = req_type;
 		return 0;
 	}
 
@@ -451,9 +541,9 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
 
 	err = rbt_memtype_check_insert(new, new_type);
 	if (err) {
-		printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
-		       start, end - 1,
-		       cattr_name(new->type), cattr_name(req_type));
+		pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+			start, end - 1,
+			cattr_name(new->type), cattr_name(req_type));
 		kfree(new);
 		spin_unlock(&memtype_lock);
 
@@ -475,7 +565,7 @@ int free_memtype(u64 start, u64 end)
 	int is_range_ram;
 	struct memtype *entry;
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 0;
 
 	/* Low ISA region is always mapped WB. No need to track */
@@ -497,8 +587,8 @@ int free_memtype(u64 start, u64 end)
 	spin_unlock(&memtype_lock);
 
 	if (!entry) {
-		printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
-		       current->comm, current->pid, start, end - 1);
+		pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
+			current->comm, current->pid, start, end - 1);
 		return -EINVAL;
 	}
 
@@ -517,7 +607,7 @@ int free_memtype(u64 start, u64 end)
  * Only to be called when PAT is enabled
  *
  * Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
- * or _PAGE_CACHE_MODE_UC
+ * or _PAGE_CACHE_MODE_WT.
  */
 static enum page_cache_mode lookup_memtype(u64 paddr)
 {
@@ -529,16 +619,9 @@ static enum page_cache_mode lookup_memtype(u64 paddr)
 
 	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
 		struct page *page;
-		page = pfn_to_page(paddr >> PAGE_SHIFT);
-		rettype = get_page_memtype(page);
-		/*
-		 * -1 from get_page_memtype() implies RAM page is in its
-		 * default state and not reserved, and hence of type WB
-		 */
-		if (rettype == -1)
-			rettype = _PAGE_CACHE_MODE_WB;
 
-		return rettype;
+		page = pfn_to_page(paddr >> PAGE_SHIFT);
+		return get_page_memtype(page);
 	}
 
 	spin_lock(&memtype_lock);
@@ -623,13 +706,13 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 	u64 to = from + size;
 	u64 cursor = from;
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 1;
 
 	while (cursor < to) {
 		if (!devmem_is_allowed(pfn)) {
-			printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
-			       current->comm, from, to - 1);
+			pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
+				current->comm, from, to - 1);
 			return 0;
 		}
 		cursor += PAGE_SIZE;
@@ -659,7 +742,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	 * caching for the high addresses through the KEN pin, but
 	 * we maintain the tradition of paranoia in this code.
 	 */
-	if (!pat_enabled &&
+	if (!pat_enabled() &&
 	    !(boot_cpu_has(X86_FEATURE_MTRR) ||
 	      boot_cpu_has(X86_FEATURE_K6_MTRR) ||
 	      boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
@@ -698,8 +781,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size,
 				size;
 
 	if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
-		printk(KERN_INFO "%s:%d ioremap_change_attr failed %s "
-			"for [mem %#010Lx-%#010Lx]\n",
+		pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
 			current->comm, current->pid,
 			cattr_name(pcm),
 			base, (unsigned long long)(base + size-1));
@@ -729,12 +811,12 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 	 * the type requested matches the type of first page in the range.
 	 */
 	if (is_ram) {
-		if (!pat_enabled)
+		if (!pat_enabled())
 			return 0;
 
 		pcm = lookup_memtype(paddr);
 		if (want_pcm != pcm) {
-			printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
+			pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
 				current->comm, current->pid,
 				cattr_name(want_pcm),
 				(unsigned long long)paddr,
@@ -755,13 +837,12 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 		if (strict_prot ||
 		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
 			free_memtype(paddr, paddr + size);
-			printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
-				" for [mem %#010Lx-%#010Lx], got %s\n",
-				current->comm, current->pid,
-				cattr_name(want_pcm),
-				(unsigned long long)paddr,
-				(unsigned long long)(paddr + size - 1),
-				cattr_name(pcm));
+			pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
+			       current->comm, current->pid,
+			       cattr_name(want_pcm),
+			       (unsigned long long)paddr,
+			       (unsigned long long)(paddr + size - 1),
+			       cattr_name(pcm));
 			return -EINVAL;
 		}
 		/*
@@ -844,7 +925,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
 		return ret;
 	}
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 0;
 
 	/*
@@ -872,7 +953,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
 {
 	enum page_cache_mode pcm;
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 0;
 
 	/* Set prot based on lookup */
@@ -913,14 +994,18 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 
 pgprot_t pgprot_writecombine(pgprot_t prot)
 {
-	if (pat_enabled)
-		return __pgprot(pgprot_val(prot) |
+	return __pgprot(pgprot_val(prot) |
 				cachemode2protval(_PAGE_CACHE_MODE_WC));
-	else
-		return pgprot_noncached(prot);
 }
 EXPORT_SYMBOL_GPL(pgprot_writecombine);
 
+pgprot_t pgprot_writethrough(pgprot_t prot)
+{
+	return __pgprot(pgprot_val(prot) |
+				cachemode2protval(_PAGE_CACHE_MODE_WT));
+}
+EXPORT_SYMBOL_GPL(pgprot_writethrough);
+
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
 
 static struct memtype *memtype_get_idx(loff_t pos)
@@ -996,7 +1081,7 @@ static const struct file_operations memtype_fops = {
 
 static int __init pat_memtype_list_init(void)
 {
-	if (pat_enabled) {
+	if (pat_enabled()) {
 		debugfs_create_file("pat_memtype_list", S_IRUSR,
 				    arch_debugfs_dir, NULL, &memtype_fops);
 	}
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
index f6411620305d..a739bfc40690 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -4,7 +4,7 @@
 extern int pat_debug_enable;
 
 #define dprintk(fmt, arg...) \
-	do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
+	do { if (pat_debug_enable) pr_info("x86/PAT: " fmt, ##arg); } while (0)
 
 struct memtype {
 	u64			start;
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 6582adcc8bd9..63931080366a 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -160,9 +160,9 @@ success:
 	return 0;
 
 failure:
-	printk(KERN_INFO "%s:%d conflicting memory types "
-		"%Lx-%Lx %s<->%s\n", current->comm, current->pid, start,
-		end, cattr_name(found_type), cattr_name(match->type));
+	pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+		current->comm, current->pid, start, end,
+		cattr_name(found_type), cattr_name(match->type));
 	return -EBUSY;
 }
 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 0b97d2c75df3..fb0a9dd1d6e4 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -563,16 +563,31 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
 }
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+/**
+ * pud_set_huge - setup kernel PUD mapping
+ *
+ * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
+ * function sets up a huge page only if any of the following conditions are met:
+ *
+ * - MTRRs are disabled, or
+ *
+ * - MTRRs are enabled and the range is completely covered by a single MTRR, or
+ *
+ * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
+ *   has no effect on the requested PAT memory type.
+ *
+ * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
+ * page mapping attempt fails.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 {
-	u8 mtrr;
+	u8 mtrr, uniform;
 
-	/*
-	 * Do not use a huge page when the range is covered by non-WB type
-	 * of MTRRs.
-	 */
-	mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE);
-	if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+	mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
+	if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+	    (mtrr != MTRR_TYPE_WRBACK))
 		return 0;
 
 	prot = pgprot_4k_2_large(prot);
@@ -584,17 +599,24 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 	return 1;
 }
 
+/**
+ * pmd_set_huge - setup kernel PMD mapping
+ *
+ * See text over pud_set_huge() above.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 {
-	u8 mtrr;
+	u8 mtrr, uniform;
 
-	/*
-	 * Do not use a huge page when the range is covered by non-WB type
-	 * of MTRRs.
-	 */
-	mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE);
-	if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+	mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
+	if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+	    (mtrr != MTRR_TYPE_WRBACK)) {
+		pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
+			     __func__, addr, addr + PMD_SIZE);
 		return 0;
+	}
 
 	prot = pgprot_4k_2_large(prot);
 
@@ -605,6 +627,11 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 	return 1;
 }
 
+/**
+ * pud_clear_huge - clear kernel PUD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PUD map is found).
+ */
 int pud_clear_huge(pud_t *pud)
 {
 	if (pud_large(*pud)) {
@@ -615,6 +642,11 @@ int pud_clear_huge(pud_t *pud)
 	return 0;
 }
 
+/**
+ * pmd_clear_huge - clear kernel PMD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PMD map is found).
+ */
 int pmd_clear_huge(pmd_t *pmd)
 {
 	if (pmd_large(*pmd)) {
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
index 6440221ced0d..4093216b3791 100644
--- a/arch/x86/net/bpf_jit.S
+++ b/arch/x86/net/bpf_jit.S
@@ -8,7 +8,6 @@
  * of the License.
  */
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
 
 /*
  * Calling convention :
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 99f76103c6b7..579a8fd74be0 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -12,6 +12,7 @@
 #include <linux/filter.h>
 #include <linux/if_vlan.h>
 #include <asm/cacheflush.h>
+#include <linux/bpf.h>
 
 int bpf_jit_enable __read_mostly;
 
@@ -37,7 +38,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
 	return ptr + len;
 }
 
-#define EMIT(bytes, len)	do { prog = emit_code(prog, bytes, len); } while (0)
+#define EMIT(bytes, len) \
+	do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)
 
 #define EMIT1(b1)		EMIT(b1, 1)
 #define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
@@ -186,31 +188,31 @@ struct jit_context {
 #define BPF_MAX_INSN_SIZE	128
 #define BPF_INSN_SAFETY		64
 
-static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
-		  int oldproglen, struct jit_context *ctx)
+#define STACKSIZE \
+	(MAX_BPF_STACK + \
+	 32 /* space for rbx, r13, r14, r15 */ + \
+	 8 /* space for skb_copy_bits() buffer */)
+
+#define PROLOGUE_SIZE 51
+
+/* emit x64 prologue code for BPF program and check it's size.
+ * bpf_tail_call helper will skip it while jumping into another program
+ */
+static void emit_prologue(u8 **pprog)
 {
-	struct bpf_insn *insn = bpf_prog->insnsi;
-	int insn_cnt = bpf_prog->len;
-	bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
-	bool seen_exit = false;
-	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
-	int i;
-	int proglen = 0;
-	u8 *prog = temp;
-	int stacksize = MAX_BPF_STACK +
-		32 /* space for rbx, r13, r14, r15 */ +
-		8 /* space for skb_copy_bits() buffer */;
+	u8 *prog = *pprog;
+	int cnt = 0;
 
 	EMIT1(0x55); /* push rbp */
 	EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */
 
-	/* sub rsp, stacksize */
-	EMIT3_off32(0x48, 0x81, 0xEC, stacksize);
+	/* sub rsp, STACKSIZE */
+	EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE);
 
 	/* all classic BPF filters use R6(rbx) save it */
 
 	/* mov qword ptr [rbp-X],rbx */
-	EMIT3_off32(0x48, 0x89, 0x9D, -stacksize);
+	EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE);
 
 	/* bpf_convert_filter() maps classic BPF register X to R7 and uses R8
 	 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and
@@ -221,16 +223,112 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 	 */
 
 	/* mov qword ptr [rbp-X],r13 */
-	EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8);
+	EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8);
 	/* mov qword ptr [rbp-X],r14 */
-	EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16);
+	EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16);
 	/* mov qword ptr [rbp-X],r15 */
-	EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24);
+	EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24);
 
 	/* clear A and X registers */
 	EMIT2(0x31, 0xc0); /* xor eax, eax */
 	EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */
 
+	/* clear tail_cnt: mov qword ptr [rbp-X], rax */
+	EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32);
+
+	BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
+	*pprog = prog;
+}
+
+/* generate the following code:
+ * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
+ *   if (index >= array->map.max_entries)
+ *     goto out;
+ *   if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
+ *     goto out;
+ *   prog = array->prog[index];
+ *   if (prog == NULL)
+ *     goto out;
+ *   goto *(prog->bpf_func + prologue_size);
+ * out:
+ */
+static void emit_bpf_tail_call(u8 **pprog)
+{
+	u8 *prog = *pprog;
+	int label1, label2, label3;
+	int cnt = 0;
+
+	/* rdi - pointer to ctx
+	 * rsi - pointer to bpf_array
+	 * rdx - index in bpf_array
+	 */
+
+	/* if (index >= array->map.max_entries)
+	 *   goto out;
+	 */
+	EMIT4(0x48, 0x8B, 0x46,                   /* mov rax, qword ptr [rsi + 16] */
+	      offsetof(struct bpf_array, map.max_entries));
+	EMIT3(0x48, 0x39, 0xD0);                  /* cmp rax, rdx */
+#define OFFSET1 44 /* number of bytes to jump */
+	EMIT2(X86_JBE, OFFSET1);                  /* jbe out */
+	label1 = cnt;
+
+	/* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+	 *   goto out;
+	 */
+	EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */
+	EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT);     /* cmp eax, MAX_TAIL_CALL_CNT */
+#define OFFSET2 33
+	EMIT2(X86_JA, OFFSET2);                   /* ja out */
+	label2 = cnt;
+	EMIT3(0x83, 0xC0, 0x01);                  /* add eax, 1 */
+	EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */
+
+	/* prog = array->prog[index]; */
+	EMIT4(0x48, 0x8D, 0x44, 0xD6);            /* lea rax, [rsi + rdx * 8 + 0x50] */
+	EMIT1(offsetof(struct bpf_array, prog));
+	EMIT3(0x48, 0x8B, 0x00);                  /* mov rax, qword ptr [rax] */
+
+	/* if (prog == NULL)
+	 *   goto out;
+	 */
+	EMIT4(0x48, 0x83, 0xF8, 0x00);            /* cmp rax, 0 */
+#define OFFSET3 10
+	EMIT2(X86_JE, OFFSET3);                   /* je out */
+	label3 = cnt;
+
+	/* goto *(prog->bpf_func + prologue_size); */
+	EMIT4(0x48, 0x8B, 0x40,                   /* mov rax, qword ptr [rax + 32] */
+	      offsetof(struct bpf_prog, bpf_func));
+	EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE);   /* add rax, prologue_size */
+
+	/* now we're ready to jump into next BPF program
+	 * rdi == ctx (1st arg)
+	 * rax == prog->bpf_func + prologue_size
+	 */
+	EMIT2(0xFF, 0xE0);                        /* jmp rax */
+
+	/* out: */
+	BUILD_BUG_ON(cnt - label1 != OFFSET1);
+	BUILD_BUG_ON(cnt - label2 != OFFSET2);
+	BUILD_BUG_ON(cnt - label3 != OFFSET3);
+	*pprog = prog;
+}
+
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+		  int oldproglen, struct jit_context *ctx)
+{
+	struct bpf_insn *insn = bpf_prog->insnsi;
+	int insn_cnt = bpf_prog->len;
+	bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0);
+	bool seen_exit = false;
+	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+	int i, cnt = 0;
+	int proglen = 0;
+	u8 *prog = temp;
+
+	emit_prologue(&prog);
+
 	if (seen_ld_abs) {
 		/* r9d : skb->len - skb->data_len (headlen)
 		 * r10 : skb->data
@@ -739,6 +837,10 @@ xadd:			if (is_imm8(insn->off))
 			}
 			break;
 
+		case BPF_JMP | BPF_CALL | BPF_X:
+			emit_bpf_tail_call(&prog);
+			break;
+
 			/* cond jump */
 		case BPF_JMP | BPF_JEQ | BPF_X:
 		case BPF_JMP | BPF_JNE | BPF_X:
@@ -891,13 +993,13 @@ common_load:
 			/* update cleanup_addr */
 			ctx->cleanup_addr = proglen;
 			/* mov rbx, qword ptr [rbp-X] */
-			EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize);
+			EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE);
 			/* mov r13, qword ptr [rbp-X] */
-			EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8);
+			EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8);
 			/* mov r14, qword ptr [rbp-X] */
-			EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16);
+			EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16);
 			/* mov r15, qword ptr [rbp-X] */
-			EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24);
+			EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24);
 
 			EMIT1(0xC9); /* leave */
 			EMIT1(0xC3); /* ret */
@@ -966,7 +1068,12 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
 	}
 	ctx.cleanup_addr = proglen;
 
-	for (pass = 0; pass < 10; pass++) {
+	/* JITed image shrinks with every pass and the loop iterates
+	 * until the image stops shrinking. Very large bpf programs
+	 * may converge on the last pass. In such case do one more
+	 * pass to emit the final image
+	 */
+	for (pass = 0; pass < 10 || image; pass++) {
 		proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
 		if (proglen <= 0) {
 			image = NULL;
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index d93963340c3c..ff9911707160 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -81,6 +81,17 @@ static const struct dmi_system_id pci_crs_quirks[] __initconst = {
 			DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
 		},
 	},
+	/* https://bugs.launchpad.net/ubuntu/+source/alsa-driver/+bug/931368 */
+	/* https://bugs.launchpad.net/ubuntu/+source/alsa-driver/+bug/1033299 */
+	{
+		.callback = set_use_crs,
+		.ident = "Foxconn K8M890-8237A",
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "Foxconn"),
+			DMI_MATCH(DMI_BOARD_NAME, "K8M890-8237A"),
+			DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
+		},
+	},
 
 	/* Now for the blacklist.. */
 
@@ -121,8 +132,10 @@ void __init pci_acpi_crs_quirks(void)
 {
 	int year;
 
-	if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008)
-		pci_use_crs = false;
+	if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008) {
+		if (iomem_resource.end <= 0xffffffff)
+			pci_use_crs = false;
+	}
 
 	dmi_check_system(pci_crs_quirks);
 
@@ -482,9 +495,16 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
 
 int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
 {
-	struct pci_sysdata *sd = bridge->bus->sysdata;
-
-	ACPI_COMPANION_SET(&bridge->dev, sd->companion);
+	/*
+	 * We pass NULL as parent to pci_create_root_bus(), so if it is not NULL
+	 * here, pci_create_root_bus() has been called by someone else and
+	 * sysdata is likely to be different from what we expect.  Let it go in
+	 * that case.
+	 */
+	if (!bridge->dev.parent) {
+		struct pci_sysdata *sd = bridge->bus->sysdata;
+		ACPI_COMPANION_SET(&bridge->dev, sd->companion);
+	}
 	return 0;
 }
 
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 349c0d32cc0b..0a9f2caf358f 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -429,12 +429,12 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
  	 * Caller can followup with UC MINUS request and add a WC mtrr if there
  	 * is a free mtrr slot.
  	 */
-	if (!pat_enabled && write_combine)
+	if (!pat_enabled() && write_combine)
 		return -EINVAL;
 
-	if (pat_enabled && write_combine)
+	if (pat_enabled() && write_combine)
 		prot |= cachemode2protval(_PAGE_CACHE_MODE_WC);
-	else if (pat_enabled || boot_cpu_data.x86 > 3)
+	else if (pat_enabled() || boot_cpu_data.x86 > 3)
 		/*
 		 * ioremap() and ioremap_nocache() defaults to UC MINUS for now.
 		 * To avoid attribute conflicts, request UC MINUS here
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 852aa4c92da0..27062303c881 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -208,6 +208,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
 
 static int intel_mid_pci_irq_enable(struct pci_dev *dev)
 {
+	struct irq_alloc_info info;
 	int polarity;
 
 	if (dev->irq_managed && dev->irq > 0)
@@ -217,14 +218,13 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
 		polarity = 0; /* active high */
 	else
 		polarity = 1; /* active low */
+	ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity);
 
 	/*
 	 * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
 	 * IOAPIC RTE entries, so we just enable RTE for the device.
 	 */
-	if (mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev)))
-		return -EBUSY;
-	if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0)
+	if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info) < 0)
 		return -EBUSY;
 
 	dev->irq_managed = 1;
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 5dc6ca5e1741..9bd115484745 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -146,19 +146,20 @@ static void __init pirq_peer_trick(void)
 
 /*
  *  Code for querying and setting of IRQ routes on various interrupt routers.
+ *  PIC Edge/Level Control Registers (ELCR) 0x4d0 & 0x4d1.
  */
 
-void eisa_set_level_irq(unsigned int irq)
+void elcr_set_level_irq(unsigned int irq)
 {
 	unsigned char mask = 1 << (irq & 7);
 	unsigned int port = 0x4d0 + (irq >> 3);
 	unsigned char val;
-	static u16 eisa_irq_mask;
+	static u16 elcr_irq_mask;
 
-	if (irq >= 16 || (1 << irq) & eisa_irq_mask)
+	if (irq >= 16 || (1 << irq) & elcr_irq_mask)
 		return;
 
-	eisa_irq_mask |= (1 << irq);
+	elcr_irq_mask |= (1 << irq);
 	printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
 	val = inb(port);
 	if (!(val & mask)) {
@@ -965,11 +966,11 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
 	} else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
 	((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
 		msg = "found";
-		eisa_set_level_irq(irq);
+		elcr_set_level_irq(irq);
 	} else if (newirq && r->set &&
 		(dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
 		if (r->set(pirq_router_dev, dev, pirq, newirq)) {
-			eisa_set_level_irq(newirq);
+			elcr_set_level_irq(newirq);
 			msg = "assigned";
 			irq = newirq;
 		}
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index a62e0be3a2f1..f1a6c8e86ddd 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -1,4 +1,5 @@
 # Platform specific code goes here
+obj-y	+= atom/
 obj-y	+= ce4100/
 obj-y	+= efi/
 obj-y	+= geode/
diff --git a/arch/x86/platform/atom/Makefile b/arch/x86/platform/atom/Makefile
new file mode 100644
index 000000000000..0a3a40cbc794
--- /dev/null
+++ b/arch/x86/platform/atom/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o
diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c
new file mode 100644
index 000000000000..5ca8ead91579
--- /dev/null
+++ b/arch/x86/platform/atom/punit_atom_debug.c
@@ -0,0 +1,183 @@
+/*
+ * Intel SOC Punit device state debug driver
+ * Punit controls power management for North Complex devices (Graphics
+ * blocks, Image Signal Processing, video processing, display, DSP etc.)
+ *
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/io.h>
+#include <asm/cpu_device_id.h>
+#include <asm/iosf_mbi.h>
+
+/* Side band Interface port */
+#define PUNIT_PORT		0x04
+/* Power gate status reg */
+#define PWRGT_STATUS		0x61
+/* Subsystem config/status Video processor */
+#define VED_SS_PM0		0x32
+/* Subsystem config/status ISP (Image Signal Processor) */
+#define ISP_SS_PM0		0x39
+/* Subsystem config/status Input/output controller */
+#define MIO_SS_PM		0x3B
+/* Shift bits for getting status for video, isp and i/o */
+#define SSS_SHIFT		24
+/* Shift bits for getting status for graphics rendering */
+#define RENDER_POS		0
+/* Shift bits for getting status for media control */
+#define MEDIA_POS		2
+/* Shift bits for getting status for Valley View/Baytrail display */
+#define VLV_DISPLAY_POS		6
+/* Subsystem config/status display for Cherry Trail SOC */
+#define CHT_DSP_SSS		0x36
+/* Shift bits for getting status for display */
+#define CHT_DSP_SSS_POS		16
+
+struct punit_device {
+	char *name;
+	int reg;
+	int sss_pos;
+};
+
+static const struct punit_device punit_device_byt[] = {
+	{ "GFX RENDER",	PWRGT_STATUS,	RENDER_POS },
+	{ "GFX MEDIA",	PWRGT_STATUS,	MEDIA_POS },
+	{ "DISPLAY",	PWRGT_STATUS,	VLV_DISPLAY_POS },
+	{ "VED",	VED_SS_PM0,	SSS_SHIFT },
+	{ "ISP",	ISP_SS_PM0,	SSS_SHIFT },
+	{ "MIO",	MIO_SS_PM,	SSS_SHIFT },
+	{ NULL }
+};
+
+static const struct punit_device punit_device_cht[] = {
+	{ "GFX RENDER",	PWRGT_STATUS,	RENDER_POS },
+	{ "GFX MEDIA",	PWRGT_STATUS,	MEDIA_POS },
+	{ "DISPLAY",	CHT_DSP_SSS,	CHT_DSP_SSS_POS },
+	{ "VED",	VED_SS_PM0,	SSS_SHIFT },
+	{ "ISP",	ISP_SS_PM0,	SSS_SHIFT },
+	{ "MIO",	MIO_SS_PM,	SSS_SHIFT },
+	{ NULL }
+};
+
+static const char * const dstates[] = {"D0", "D0i1", "D0i2", "D0i3"};
+
+static int punit_dev_state_show(struct seq_file *seq_file, void *unused)
+{
+	u32 punit_pwr_status;
+	struct punit_device *punit_devp = seq_file->private;
+	int index;
+	int status;
+
+	seq_puts(seq_file, "\n\nPUNIT NORTH COMPLEX DEVICES :\n");
+	while (punit_devp->name) {
+		status = iosf_mbi_read(PUNIT_PORT, BT_MBI_PMC_READ,
+				       punit_devp->reg,
+				       &punit_pwr_status);
+		if (status) {
+			seq_printf(seq_file, "%9s : Read Failed\n",
+				   punit_devp->name);
+		} else  {
+			index = (punit_pwr_status >> punit_devp->sss_pos) & 3;
+			seq_printf(seq_file, "%9s : %s\n", punit_devp->name,
+				   dstates[index]);
+		}
+		punit_devp++;
+	}
+
+	return 0;
+}
+
+static int punit_dev_state_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, punit_dev_state_show, inode->i_private);
+}
+
+static const struct file_operations punit_dev_state_ops = {
+	.open		= punit_dev_state_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static struct dentry *punit_dbg_file;
+
+static int punit_dbgfs_register(struct punit_device *punit_device)
+{
+	static struct dentry *dev_state;
+
+	punit_dbg_file = debugfs_create_dir("punit_atom", NULL);
+	if (!punit_dbg_file)
+		return -ENXIO;
+
+	dev_state = debugfs_create_file("dev_power_state", S_IFREG | S_IRUGO,
+					punit_dbg_file, punit_device,
+					&punit_dev_state_ops);
+	if (!dev_state) {
+		pr_err("punit_dev_state register failed\n");
+		debugfs_remove(punit_dbg_file);
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+static void punit_dbgfs_unregister(void)
+{
+	debugfs_remove_recursive(punit_dbg_file);
+}
+
+#define ICPU(model, drv_data) \
+	{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT,\
+	  (kernel_ulong_t)&drv_data }
+
+static const struct x86_cpu_id intel_punit_cpu_ids[] = {
+	ICPU(55, punit_device_byt), /* Valleyview, Bay Trail */
+	ICPU(76, punit_device_cht), /* Braswell, Cherry Trail */
+	{}
+};
+
+MODULE_DEVICE_TABLE(x86cpu, intel_punit_cpu_ids);
+
+static int __init punit_atom_debug_init(void)
+{
+	const struct x86_cpu_id *id;
+	int ret;
+
+	id = x86_match_cpu(intel_punit_cpu_ids);
+	if (!id)
+		return -ENODEV;
+
+	ret = punit_dbgfs_register((struct punit_device *)id->driver_data);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+static void __exit punit_atom_debug_exit(void)
+{
+	punit_dbgfs_unregister();
+}
+
+module_init(punit_atom_debug_init);
+module_exit(punit_atom_debug_exit);
+
+MODULE_AUTHOR("Kumar P, Mahesh <mahesh.kumar.p@intel.com>");
+MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
+MODULE_DESCRIPTION("Driver for Punit devices states debugging");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 02744df576d5..3b984c3aa1b0 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -501,6 +501,8 @@ void __init efi_init(void)
 
 	if (efi_enabled(EFI_DBG))
 		print_efi_memmap();
+
+	efi_esrt_init();
 }
 
 void __init efi_late_init(void)
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
index 0b283d4d0ad7..de734134bc8d 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_wdt.c
@@ -27,6 +27,7 @@ static struct platform_device wdt_dev = {
 static int tangier_probe(struct platform_device *pdev)
 {
 	int gsi;
+	struct irq_alloc_info info;
 	struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data;
 
 	if (!pdata)
@@ -34,8 +35,8 @@ static int tangier_probe(struct platform_device *pdev)
 
 	/* IOAPIC builds identity mapping between GSI and IRQ on MID */
 	gsi = pdata->irq;
-	if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) ||
-	    mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC) <= 0) {
+	ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0);
+	if (mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info) <= 0) {
 		dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n",
 			 gsi);
 		return -EINVAL;
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index 3005f0c89f2e..01d54ea766c1 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -81,26 +81,34 @@ static unsigned long __init intel_mid_calibrate_tsc(void)
 	return 0;
 }
 
+static void __init intel_mid_setup_bp_timer(void)
+{
+	apbt_time_init();
+	setup_boot_APIC_clock();
+}
+
 static void __init intel_mid_time_init(void)
 {
 	sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
+
 	switch (intel_mid_timer_options) {
 	case INTEL_MID_TIMER_APBT_ONLY:
 		break;
 	case INTEL_MID_TIMER_LAPIC_APBT:
-		x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
+		/* Use apbt and local apic */
+		x86_init.timers.setup_percpu_clockev = intel_mid_setup_bp_timer;
 		x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
-		break;
+		return;
 	default:
 		if (!boot_cpu_has(X86_FEATURE_ARAT))
 			break;
+		/* Lapic only, no apbt */
 		x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
 		x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
 		return;
 	}
-	/* we need at least one APB timer */
-	pre_init_apic_IRQ0();
-	apbt_time_init();
+
+	x86_init.timers.setup_percpu_clockev = apbt_time_init;
 }
 
 static void intel_mid_arch_setup(void)
diff --git a/arch/x86/platform/intel-mid/sfi.c b/arch/x86/platform/intel-mid/sfi.c
index c14ad34776c4..ce992e8cc065 100644
--- a/arch/x86/platform/intel-mid/sfi.c
+++ b/arch/x86/platform/intel-mid/sfi.c
@@ -95,18 +95,16 @@ int __init sfi_parse_mtmr(struct sfi_table_header *table)
 		pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz, irq = %d\n",
 			totallen, (u32)pentry->phys_addr,
 			pentry->freq_hz, pentry->irq);
-			if (!pentry->irq)
-				continue;
-			mp_irq.type = MP_INTSRC;
-			mp_irq.irqtype = mp_INT;
-/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
-			mp_irq.irqflag = 5;
-			mp_irq.srcbus = MP_BUS_ISA;
-			mp_irq.srcbusirq = pentry->irq;	/* IRQ */
-			mp_irq.dstapic = MP_APIC_ALL;
-			mp_irq.dstirq = pentry->irq;
-			mp_save_irq(&mp_irq);
-			mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC);
+		mp_irq.type = MP_INTSRC;
+		mp_irq.irqtype = mp_INT;
+		/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
+		mp_irq.irqflag = 5;
+		mp_irq.srcbus = MP_BUS_ISA;
+		mp_irq.srcbusirq = pentry->irq;	/* IRQ */
+		mp_irq.dstapic = MP_APIC_ALL;
+		mp_irq.dstirq = pentry->irq;
+		mp_save_irq(&mp_irq);
+		mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL);
 	}
 
 	return 0;
@@ -177,7 +175,7 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
 		mp_irq.dstapic = MP_APIC_ALL;
 		mp_irq.dstirq = pentry->irq;
 		mp_save_irq(&mp_irq);
-		mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC);
+		mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL);
 	}
 	return 0;
 }
@@ -436,6 +434,7 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
 	struct devs_id *dev = NULL;
 	int num, i, ret;
 	int polarity;
+	struct irq_alloc_info info;
 
 	sb = (struct sfi_table_simple *)table;
 	num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
@@ -469,9 +468,8 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
 				polarity = 1;
 			}
 
-			ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE);
-			if (ret == 0)
-				ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC);
+			ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, polarity);
+			ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, &info);
 			WARN_ON(ret < 0);
 		}
 
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
index 2a8a74f3bd76..6c7111bbd1e9 100644
--- a/arch/x86/platform/sfi/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -25,8 +25,8 @@
 #include <linux/init.h>
 #include <linux/sfi.h>
 #include <linux/io.h>
-#include <linux/irqdomain.h>
 
+#include <asm/irqdomain.h>
 #include <asm/io_apic.h>
 #include <asm/mpspec.h>
 #include <asm/setup.h>
@@ -71,9 +71,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
 #endif /* CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_IO_APIC
-static struct irq_domain_ops sfi_ioapic_irqdomain_ops = {
-	.map = mp_irqdomain_map,
-};
 
 static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 {
@@ -82,7 +79,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 	int i, num;
 	struct ioapic_domain_cfg cfg = {
 		.type = IOAPIC_DOMAIN_STRICT,
-		.ops = &sfi_ioapic_irqdomain_ops,
+		.ops = &mp_ioapic_irqdomain_ops,
 	};
 
 	sb = (struct sfi_table_simple *)table;
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 0ce673645432..8570abe68be1 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -13,22 +13,37 @@
 #include <linux/slab.h>
 #include <linux/irq.h>
 
+#include <asm/irqdomain.h>
 #include <asm/apic.h>
 #include <asm/uv/uv_irq.h>
 #include <asm/uv/uv_hub.h>
 
 /* MMR offset and pnode of hub sourcing interrupts for a given irq */
-struct uv_irq_2_mmr_pnode{
-	struct rb_node		list;
+struct uv_irq_2_mmr_pnode {
 	unsigned long		offset;
 	int			pnode;
-	int			irq;
 };
 
-static DEFINE_SPINLOCK(uv_irq_lock);
-static struct rb_root		uv_irq_root;
+static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info)
+{
+	unsigned long mmr_value;
+	struct uv_IO_APIC_route_entry *entry;
+
+	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
+		     sizeof(unsigned long));
+
+	mmr_value = 0;
+	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+	entry->vector		= cfg->vector;
+	entry->delivery_mode	= apic->irq_delivery_mode;
+	entry->dest_mode	= apic->irq_dest_mode;
+	entry->polarity		= 0;
+	entry->trigger		= 0;
+	entry->mask		= 0;
+	entry->dest		= cfg->dest_apicid;
 
-static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool);
+	uv_write_global_mmr64(info->pnode, info->offset, mmr_value);
+}
 
 static void uv_noop(struct irq_data *data) { }
 
@@ -37,6 +52,23 @@ static void uv_ack_apic(struct irq_data *data)
 	ack_APIC_irq();
 }
 
+static int
+uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
+		    bool force)
+{
+	struct irq_data *parent = data->parent_data;
+	struct irq_cfg *cfg = irqd_cfg(data);
+	int ret;
+
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	if (ret >= 0) {
+		uv_program_mmr(cfg, data->chip_data);
+		send_cleanup_vector(cfg);
+	}
+
+	return ret;
+}
+
 static struct irq_chip uv_irq_chip = {
 	.name			= "UV-CORE",
 	.irq_mask		= uv_noop,
@@ -45,189 +77,99 @@ static struct irq_chip uv_irq_chip = {
 	.irq_set_affinity	= uv_set_irq_affinity,
 };
 
-/*
- * Add offset and pnode information of the hub sourcing interrupts to the
- * rb tree for a specific irq.
- */
-static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
+static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq,
+			   unsigned int nr_irqs, void *arg)
 {
-	struct rb_node **link = &uv_irq_root.rb_node;
-	struct rb_node *parent = NULL;
-	struct uv_irq_2_mmr_pnode *n;
-	struct uv_irq_2_mmr_pnode *e;
-	unsigned long irqflags;
-
-	n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
-				uv_blade_to_memory_nid(blade));
-	if (!n)
+	struct uv_irq_2_mmr_pnode *chip_data;
+	struct irq_alloc_info *info = arg;
+	struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+	int ret;
+
+	if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_UV)
+		return -EINVAL;
+
+	chip_data = kmalloc_node(sizeof(*chip_data), GFP_KERNEL,
+				 irq_data->node);
+	if (!chip_data)
 		return -ENOMEM;
 
-	n->irq = irq;
-	n->offset = offset;
-	n->pnode = uv_blade_to_pnode(blade);
-	spin_lock_irqsave(&uv_irq_lock, irqflags);
-	/* Find the right place in the rbtree: */
-	while (*link) {
-		parent = *link;
-		e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
-
-		if (unlikely(irq == e->irq)) {
-			/* irq entry exists */
-			e->pnode = uv_blade_to_pnode(blade);
-			e->offset = offset;
-			spin_unlock_irqrestore(&uv_irq_lock, irqflags);
-			kfree(n);
-			return 0;
-		}
-
-		if (irq < e->irq)
-			link = &(*link)->rb_left;
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+	if (ret >= 0) {
+		if (info->uv_limit == UV_AFFINITY_CPU)
+			irq_set_status_flags(virq, IRQ_NO_BALANCING);
 		else
-			link = &(*link)->rb_right;
+			irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
+
+		chip_data->pnode = uv_blade_to_pnode(info->uv_blade);
+		chip_data->offset = info->uv_offset;
+		irq_domain_set_info(domain, virq, virq, &uv_irq_chip, chip_data,
+				    handle_percpu_irq, NULL, info->uv_name);
+	} else {
+		kfree(chip_data);
 	}
 
-	/* Insert the node into the rbtree. */
-	rb_link_node(&n->list, parent, link);
-	rb_insert_color(&n->list, &uv_irq_root);
-
-	spin_unlock_irqrestore(&uv_irq_lock, irqflags);
-	return 0;
+	return ret;
 }
 
-/* Retrieve offset and pnode information from the rb tree for a specific irq */
-int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
+static void uv_domain_free(struct irq_domain *domain, unsigned int virq,
+			   unsigned int nr_irqs)
 {
-	struct uv_irq_2_mmr_pnode *e;
-	struct rb_node *n;
-	unsigned long irqflags;
-
-	spin_lock_irqsave(&uv_irq_lock, irqflags);
-	n = uv_irq_root.rb_node;
-	while (n) {
-		e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
-
-		if (e->irq == irq) {
-			*offset = e->offset;
-			*pnode = e->pnode;
-			spin_unlock_irqrestore(&uv_irq_lock, irqflags);
-			return 0;
-		}
-
-		if (irq < e->irq)
-			n = n->rb_left;
-		else
-			n = n->rb_right;
-	}
-	spin_unlock_irqrestore(&uv_irq_lock, irqflags);
-	return -1;
+	struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+
+	BUG_ON(nr_irqs != 1);
+	kfree(irq_data->chip_data);
+	irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
+	irq_clear_status_flags(virq, IRQ_NO_BALANCING);
+	irq_domain_free_irqs_top(domain, virq, nr_irqs);
 }
 
 /*
  * Re-target the irq to the specified CPU and enable the specified MMR located
  * on the specified blade to allow the sending of MSIs to the specified CPU.
  */
-static int
-arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
-		       unsigned long mmr_offset, int limit)
+static void uv_domain_activate(struct irq_domain *domain,
+			       struct irq_data *irq_data)
 {
-	const struct cpumask *eligible_cpu = cpumask_of(cpu);
-	struct irq_cfg *cfg = irq_cfg(irq);
-	unsigned long mmr_value;
-	struct uv_IO_APIC_route_entry *entry;
-	int mmr_pnode, err;
-	unsigned int dest;
-
-	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
-			sizeof(unsigned long));
-
-	err = assign_irq_vector(irq, cfg, eligible_cpu);
-	if (err != 0)
-		return err;
-
-	err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest);
-	if (err != 0)
-		return err;
-
-	if (limit == UV_AFFINITY_CPU)
-		irq_set_status_flags(irq, IRQ_NO_BALANCING);
-	else
-		irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-
-	irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
-				      irq_name);
-
-	mmr_value = 0;
-	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-	entry->vector		= cfg->vector;
-	entry->delivery_mode	= apic->irq_delivery_mode;
-	entry->dest_mode	= apic->irq_dest_mode;
-	entry->polarity		= 0;
-	entry->trigger		= 0;
-	entry->mask		= 0;
-	entry->dest		= dest;
-
-	mmr_pnode = uv_blade_to_pnode(mmr_blade);
-	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
-
-	if (cfg->move_in_progress)
-		send_cleanup_vector(cfg);
-
-	return irq;
+	uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data);
 }
 
 /*
  * Disable the specified MMR located on the specified blade so that MSIs are
  * longer allowed to be sent.
  */
-static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
+static void uv_domain_deactivate(struct irq_domain *domain,
+				 struct irq_data *irq_data)
 {
 	unsigned long mmr_value;
 	struct uv_IO_APIC_route_entry *entry;
 
-	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
-			sizeof(unsigned long));
-
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
 	entry->mask = 1;
-
-	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+	uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data);
 }
 
-static int
-uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
-		    bool force)
-{
-	struct irq_cfg *cfg = irqd_cfg(data);
-	unsigned int dest;
-	unsigned long mmr_value, mmr_offset;
-	struct uv_IO_APIC_route_entry *entry;
-	int mmr_pnode;
-
-	if (apic_set_affinity(data, mask, &dest))
-		return -1;
-
-	mmr_value = 0;
-	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-
-	entry->vector		= cfg->vector;
-	entry->delivery_mode	= apic->irq_delivery_mode;
-	entry->dest_mode	= apic->irq_dest_mode;
-	entry->polarity		= 0;
-	entry->trigger		= 0;
-	entry->mask		= 0;
-	entry->dest		= dest;
-
-	/* Get previously stored MMR and pnode of hub sourcing interrupts */
-	if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode))
-		return -1;
-
-	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+static const struct irq_domain_ops uv_domain_ops = {
+	.alloc		= uv_domain_alloc,
+	.free		= uv_domain_free,
+	.activate	= uv_domain_activate,
+	.deactivate	= uv_domain_deactivate,
+};
 
-	if (cfg->move_in_progress)
-		send_cleanup_vector(cfg);
+static struct irq_domain *uv_get_irq_domain(void)
+{
+	static struct irq_domain *uv_domain;
+	static DEFINE_MUTEX(uv_lock);
+
+	mutex_lock(&uv_lock);
+	if (uv_domain == NULL) {
+		uv_domain = irq_domain_add_tree(NULL, &uv_domain_ops, NULL);
+		if (uv_domain)
+			uv_domain->parent = x86_vector_domain;
+	}
+	mutex_unlock(&uv_lock);
 
-	return IRQ_SET_MASK_OK_NOCOPY;
+	return uv_domain;
 }
 
 /*
@@ -238,19 +180,21 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
 int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
 		 unsigned long mmr_offset, int limit)
 {
-	int ret, irq = irq_alloc_hwirq(uv_blade_to_memory_nid(mmr_blade));
+	struct irq_alloc_info info;
+	struct irq_domain *domain = uv_get_irq_domain();
 
-	if (!irq)
-		return -EBUSY;
+	if (!domain)
+		return -ENOMEM;
 
-	ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
-		limit);
-	if (ret == irq)
-		uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
-	else
-		irq_free_hwirq(irq);
+	init_irq_alloc_info(&info, cpumask_of(cpu));
+	info.type = X86_IRQ_ALLOC_TYPE_UV;
+	info.uv_limit = limit;
+	info.uv_blade = mmr_blade;
+	info.uv_offset = mmr_offset;
+	info.uv_name = irq_name;
 
-	return ret;
+	return irq_domain_alloc_irqs(domain, 1,
+				     uv_blade_to_memory_nid(mmr_blade), &info);
 }
 EXPORT_SYMBOL_GPL(uv_setup_irq);
 
@@ -263,26 +207,6 @@ EXPORT_SYMBOL_GPL(uv_setup_irq);
  */
 void uv_teardown_irq(unsigned int irq)
 {
-	struct uv_irq_2_mmr_pnode *e;
-	struct rb_node *n;
-	unsigned long irqflags;
-
-	spin_lock_irqsave(&uv_irq_lock, irqflags);
-	n = uv_irq_root.rb_node;
-	while (n) {
-		e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
-		if (e->irq == irq) {
-			arch_disable_uv_irq(e->pnode, e->offset);
-			rb_erase(n, &uv_irq_root);
-			kfree(e);
-			break;
-		}
-		if (irq < e->irq)
-			n = n->rb_left;
-		else
-			n = n->rb_right;
-	}
-	spin_unlock_irqrestore(&uv_irq_lock, irqflags);
-	irq_free_hwirq(irq);
+	irq_domain_free_irqs(irq, 1);
 }
 EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 757678fb26e1..0d7dd1f5ac36 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -18,10 +18,9 @@
 #include <asm/mtrr.h>
 #include <asm/page.h>
 #include <asm/mce.h>
-#include <asm/xcr.h>
 #include <asm/suspend.h>
+#include <asm/fpu/internal.h>
 #include <asm/debugreg.h>
-#include <asm/fpu-internal.h> /* pcntxt_mask */
 #include <asm/cpu.h>
 
 #ifdef CONFIG_X86_32
@@ -155,6 +154,8 @@ static void fix_processor_context(void)
 #endif
 	load_TR_desc();				/* This does ltr */
 	load_LDT(&current->active_mm->context);	/* This does lldt */
+
+	fpu__resume_cpu();
 }
 
 /**
@@ -221,12 +222,6 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
 	wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
 #endif
 
-	/*
-	 * restore XCR0 for xsave capable cpu's.
-	 */
-	if (cpu_has_xsave)
-		xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
-
 	fix_processor_context();
 
 	do_fpu_end();
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index 3c4469a7a929..e2386cb4e0c3 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -78,9 +78,9 @@ ENTRY(restore_image)
 
 	/* code below has been relocated to a safe page */
 ENTRY(core_restore_code)
-loop:
+.Lloop:
 	testq	%rdx, %rdx
-	jz	done
+	jz	.Ldone
 
 	/* get addresses from the pbe and copy the page */
 	movq	pbe_address(%rdx), %rsi
@@ -91,8 +91,8 @@ loop:
 
 	/* progress to the next pbe */
 	movq	pbe_next(%rdx), %rdx
-	jmp	loop
-done:
+	jmp	.Lloop
+.Ldone:
 	/* jump to the restore_registers address from the image header */
 	jmpq	*%rax
 	/*
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index acb384d24669..a8fecc226946 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -26,7 +26,7 @@ else
 
 obj-y += syscalls_64.o vdso/
 
-subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../lib/thunk_64.o \
+subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o \
 		../lib/rwsem.o
 
 endif
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 7e8a1a650435..b9531d343134 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -39,7 +39,8 @@
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
+
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
 
 #define read_barrier_depends()		do { } while (0)
 #define smp_read_barrier_depends()	do { } while (0)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 46957ead3060..0b95c9b8283f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1181,10 +1181,11 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.read_tscp = native_read_tscp,
 
 	.iret = xen_iret,
-	.irq_enable_sysexit = xen_sysexit,
 #ifdef CONFIG_X86_64
 	.usergs_sysret32 = xen_sysret32,
 	.usergs_sysret64 = xen_sysret64,
+#else
+	.irq_enable_sysexit = xen_sysexit,
 #endif
 
 	.load_tr_desc = paravirt_nop,
@@ -1423,7 +1424,7 @@ static void xen_pvh_set_cr_flags(int cpu)
 		return;
 	/*
 	 * For BSP, PSE PGE are set in probe_page_size_mask(), for APs
-	 * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu_init.
+	 * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu().
 	*/
 	if (cpu_has_pse)
 		cr4_set_bits_and_update_boot(X86_CR4_PSE);
@@ -1467,6 +1468,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
 {
 	struct physdev_set_iopl set_iopl;
 	unsigned long initrd_start = 0;
+	u64 pat;
 	int rc;
 
 	if (!xen_start_info)
@@ -1574,8 +1576,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	 * Modify the cache mode translation tables to match Xen's PAT
 	 * configuration.
 	 */
-
-	pat_init_cache_modes();
+	rdmsrl(MSR_IA32_CR_PAT, pat);
+	pat_init_cache_modes(pat);
 
 	/* keep using Xen gdt for now; no urgent need to change it */
 
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index b47124d4cd67..8b7f18e200aa 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -67,6 +67,7 @@
 #include <linux/seq_file.h>
 #include <linux/bootmem.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 
 #include <asm/cache.h>
 #include <asm/setup.h>
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 956374c1edbc..9e2ba5c6e1dd 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -17,6 +17,56 @@
 #include "xen-ops.h"
 #include "debugfs.h"
 
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(char *, irq_name);
+static bool xen_pvspin = true;
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+#include <asm/qspinlock.h>
+
+static void xen_qlock_kick(int cpu)
+{
+	xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+}
+
+/*
+ * Halt the current CPU & release it back to the host
+ */
+static void xen_qlock_wait(u8 *byte, u8 val)
+{
+	int irq = __this_cpu_read(lock_kicker_irq);
+
+	/* If kicker interrupts not initialized yet, just spin */
+	if (irq == -1)
+		return;
+
+	/* clear pending */
+	xen_clear_irq_pending(irq);
+	barrier();
+
+	/*
+	 * We check the byte value after clearing pending IRQ to make sure
+	 * that we won't miss a wakeup event because of the clearing.
+	 *
+	 * The sync_clear_bit() call in xen_clear_irq_pending() is atomic.
+	 * So it is effectively a memory barrier for x86.
+	 */
+	if (READ_ONCE(*byte) != val)
+		return;
+
+	/*
+	 * If an interrupt happens here, it will leave the wakeup irq
+	 * pending, which will cause xen_poll_irq() to return
+	 * immediately.
+	 */
+
+	/* Block until irq becomes pending (or perhaps a spurious wakeup) */
+	xen_poll_irq(irq);
+}
+
+#else /* CONFIG_QUEUED_SPINLOCKS */
+
 enum xen_contention_stat {
 	TAKEN_SLOW,
 	TAKEN_SLOW_PICKUP,
@@ -100,12 +150,9 @@ struct xen_lock_waiting {
 	__ticket_t want;
 };
 
-static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
-static DEFINE_PER_CPU(char *, irq_name);
 static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
 static cpumask_t waiting_cpus;
 
-static bool xen_pvspin = true;
 __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 {
 	int irq = __this_cpu_read(lock_kicker_irq);
@@ -217,6 +264,7 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
 		}
 	}
 }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 
 static irqreturn_t dummy_handler(int irq, void *dev_id)
 {
@@ -280,8 +328,16 @@ void __init xen_init_spinlocks(void)
 		return;
 	}
 	printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
+#ifdef CONFIG_QUEUED_SPINLOCKS
+	__pv_init_lock_hash();
+	pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+	pv_lock_ops.wait = xen_qlock_wait;
+	pv_lock_ops.kick = xen_qlock_kick;
+#else
 	pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
 	pv_lock_ops.unlock_kick = xen_unlock_kick;
+#endif
 }
 
 /*
@@ -310,7 +366,7 @@ static __init int xen_parse_nopvspin(char *arg)
 }
 early_param("xen_nopvspin", xen_parse_nopvspin);
 
-#ifdef CONFIG_XEN_DEBUG_FS
+#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS)
 
 static struct dentry *d_spin_debug;
 
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 985fc3ee0973..f22667abf7b9 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -15,6 +15,8 @@
 #include <asm/percpu.h>
 #include <asm/processor-flags.h>
 #include <asm/segment.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
 
 #include <xen/interface/xen.h>
 
@@ -47,29 +49,13 @@ ENTRY(xen_iret)
 ENDPATCH(xen_iret)
 RELOC(xen_iret, 1b+1)
 
-/*
- * sysexit is not used for 64-bit processes, so it's only ever used to
- * return to 32-bit compat userspace.
- */
-ENTRY(xen_sysexit)
-	pushq $__USER32_DS
-	pushq %rcx
-	pushq $X86_EFLAGS_IF
-	pushq $__USER32_CS
-	pushq %rdx
-
-	pushq $0
-1:	jmp hypercall_iret
-ENDPATCH(xen_sysexit)
-RELOC(xen_sysexit, 1b+1)
-
 ENTRY(xen_sysret64)
 	/*
 	 * We're already on the usermode stack at this point, but
 	 * still with the kernel gs, so we can easily switch back
 	 */
 	movq %rsp, PER_CPU_VAR(rsp_scratch)
-	movq PER_CPU_VAR(kernel_stack), %rsp
+	movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	pushq $__USER_DS
 	pushq PER_CPU_VAR(rsp_scratch)
@@ -88,7 +74,7 @@ ENTRY(xen_sysret32)
 	 * still with the kernel gs, so we can easily switch back
 	 */
 	movq %rsp, PER_CPU_VAR(rsp_scratch)
-	movq PER_CPU_VAR(kernel_stack), %rsp
+	movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	pushq $__USER32_DS
 	pushq PER_CPU_VAR(rsp_scratch)
@@ -128,7 +114,7 @@ RELOC(xen_sysret32, 1b+1)
 /* Normal 64-bit system call target */
 ENTRY(xen_syscall_target)
 	undo_xen_syscall
-	jmp system_call_after_swapgs
+	jmp entry_SYSCALL_64_after_swapgs
 ENDPROC(xen_syscall_target)
 
 #ifdef CONFIG_IA32_EMULATION
@@ -136,13 +122,13 @@ ENDPROC(xen_syscall_target)
 /* 32-bit compat syscall target */
 ENTRY(xen_syscall32_target)
 	undo_xen_syscall
-	jmp ia32_cstar_target
+	jmp entry_SYSCALL_compat
 ENDPROC(xen_syscall32_target)
 
 /* 32-bit compat sysenter target */
 ENTRY(xen_sysenter_target)
 	undo_xen_syscall
-	jmp ia32_sysenter_target
+	jmp entry_SYSENTER_compat
 ENDPROC(xen_sysenter_target)
 
 #else /* !CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9e195c683549..c20fe29e65f4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -134,7 +134,9 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long);
 
 /* These are not functions, and cannot be called normally */
 __visible void xen_iret(void);
+#ifdef CONFIG_X86_32
 __visible void xen_sysexit(void);
+#endif
 __visible void xen_sysret32(void);
 __visible void xen_sysret64(void);
 __visible void xen_adjust_exception_frame(void);
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 87be10e8b57a..e5b872ba2484 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -151,7 +151,7 @@ config HAVE_SMP
 	  the CPU core definition and currently needs to be selected manually.
 
 	  Multiprocessor support in implemented with external cache and
-	  interrupt controlers.
+	  interrupt controllers.
 
 	  The MX interrupt distributer adds Interprocessor Interrupts
 	  and causes the IRQ numbers to be increased by 4 for devices
diff --git a/arch/xtensa/include/asm/dma-mapping.h b/arch/xtensa/include/asm/dma-mapping.h
index 172a02a6ad14..ba78ccf651e7 100644
--- a/arch/xtensa/include/asm/dma-mapping.h
+++ b/arch/xtensa/include/asm/dma-mapping.h
@@ -185,4 +185,17 @@ static inline int dma_get_sgtable(struct device *dev, struct sg_table *sgt,
 	return -EINVAL;
 }
 
+static inline void *dma_alloc_attrs(struct device *dev, size_t size,
+				    dma_addr_t *dma_handle, gfp_t flag,
+				    struct dma_attrs *attrs)
+{
+	return NULL;
+}
+
+static inline void dma_free_attrs(struct device *dev, size_t size,
+				  void *vaddr, dma_addr_t dma_handle,
+				  struct dma_attrs *attrs)
+{
+}
+
 #endif	/* _XTENSA_DMA_MAPPING_H */
diff --git a/arch/xtensa/include/asm/io.h b/arch/xtensa/include/asm/io.h
index fe1600a09438..c39bb6e61911 100644
--- a/arch/xtensa/include/asm/io.h
+++ b/arch/xtensa/include/asm/io.h
@@ -59,6 +59,7 @@ static inline void __iomem *ioremap_cache(unsigned long offset,
 }
 
 #define ioremap_wc ioremap_nocache
+#define ioremap_wt ioremap_nocache
 
 static inline void __iomem *ioremap(unsigned long offset, unsigned long size)
 {
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 9e3571a6535c..83a44a33cfa1 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -15,10 +15,10 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/hardirq.h>
+#include <linux/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
 #include <asm/hardirq.h>
-#include <asm/uaccess.h>
 #include <asm/pgalloc.h>
 
 DEFINE_PER_CPU(unsigned long, asid_cache) = ASID_USER_FIRST;
@@ -57,7 +57,7 @@ void do_page_fault(struct pt_regs *regs)
 	/* If we're in an interrupt or have no user
 	 * context, we must not take the fault..
 	 */
-	if (in_atomic() || !mm) {
+	if (faulthandler_disabled() || !mm) {
 		bad_page_fault(regs, address, SIGSEGV);
 		return;
 	}
diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c
index 8cfb71ec0937..184ceadccc1a 100644
--- a/arch/xtensa/mm/highmem.c
+++ b/arch/xtensa/mm/highmem.c
@@ -42,6 +42,7 @@ void *kmap_atomic(struct page *page)
 	enum fixed_addresses idx;
 	unsigned long vaddr;
 
+	preempt_disable();
 	pagefault_disable();
 	if (!PageHighMem(page))
 		return page_address(page);
@@ -79,6 +80,7 @@ void __kunmap_atomic(void *kvaddr)
 	}
 
 	pagefault_enable();
+	preempt_enable();
 }
 EXPORT_SYMBOL(__kunmap_atomic);