29 files changed, 1101 insertions, 1188 deletions
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index 2795618e4f07..64dc1ad59801 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -82,7 +82,7 @@ config 32BIT
 
 config PMB_ENABLE
 	bool "Support 32-bit physical addressing through PMB"
-	depends on MMU && EXPERIMENTAL && (CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785)
+	depends on MMU && EXPERIMENTAL && (CPU_SUBTYPE_SH7757 || CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785)
 	select 32BIT
 	default y
 	help
@@ -97,7 +97,7 @@ choice
 
 config PMB
 	bool "PMB"
-	depends on MMU && EXPERIMENTAL && (CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785)
+	depends on MMU && EXPERIMENTAL && (CPU_SUBTYPE_SH7757 || CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785)
 	select 32BIT
 	help
 	  If you say Y here, physical addressing will be extended to
@@ -106,7 +106,8 @@ config PMB
 
 config PMB_FIXED
 	bool "fixed PMB"
-	depends on MMU && EXPERIMENTAL && (CPU_SUBTYPE_SH7780 || \
+	depends on MMU && EXPERIMENTAL && (CPU_SUBTYPE_SH7757 || \
+					   CPU_SUBTYPE_SH7780 || \
 					   CPU_SUBTYPE_SH7785)
 	select 32BIT
 	help
diff --git a/arch/sh/mm/Makefile b/arch/sh/mm/Makefile
index 9f4bc3d90b1e..3759bf853293 100644
--- a/arch/sh/mm/Makefile
+++ b/arch/sh/mm/Makefile
@@ -1,5 +1,65 @@
-ifeq ($(CONFIG_SUPERH32),y)
-include ${srctree}/arch/sh/mm/Makefile_32
-else
-include ${srctree}/arch/sh/mm/Makefile_64
+#
+# Makefile for the Linux SuperH-specific parts of the memory manager.
+#
+
+obj-y			:= cache.o init.o consistent.o mmap.o
+
+cacheops-$(CONFIG_CPU_SH2)		:= cache-sh2.o
+cacheops-$(CONFIG_CPU_SH2A)		:= cache-sh2a.o
+cacheops-$(CONFIG_CPU_SH3)		:= cache-sh3.o
+cacheops-$(CONFIG_CPU_SH4)		:= cache-sh4.o flush-sh4.o
+cacheops-$(CONFIG_CPU_SH5)		:= cache-sh5.o flush-sh4.o
+cacheops-$(CONFIG_SH7705_CACHE_32KB)	+= cache-sh7705.o
+
+obj-y			+= $(cacheops-y)
+
+mmu-y			:= nommu.o extable_32.o
+mmu-$(CONFIG_MMU)	:= extable_$(BITS).o fault_$(BITS).o \
+			   ioremap_$(BITS).o kmap.o tlbflush_$(BITS).o
+
+obj-y			+= $(mmu-y)
+obj-$(CONFIG_DEBUG_FS)	+= asids-debugfs.o
+
+ifdef CONFIG_DEBUG_FS
+obj-$(CONFIG_CPU_SH4)	+= cache-debugfs.o
 endif
+
+ifdef CONFIG_MMU
+tlb-$(CONFIG_CPU_SH3)		:= tlb-sh3.o
+tlb-$(CONFIG_CPU_SH4)		:= tlb-sh4.o
+tlb-$(CONFIG_CPU_SH5)		:= tlb-sh5.o
+tlb-$(CONFIG_CPU_HAS_PTEAEX)	:= tlb-pteaex.o
+obj-y				+= $(tlb-y)
+endif
+
+obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
+obj-$(CONFIG_PMB)		+= pmb.o
+obj-$(CONFIG_PMB_FIXED)		+= pmb-fixed.o
+obj-$(CONFIG_NUMA)		+= numa.o
+
+# Special flags for fault_64.o.  This puts restrictions on the number of
+# caller-save registers that the compiler can target when building this file.
+# This is required because the code is called from a context in entry.S where
+# very few registers have been saved in the exception handler (for speed
+# reasons).
+# The caller save registers that have been saved and which can be used are
+# r2,r3,r4,r5 : argument passing
+# r15, r18 : SP and LINK
+# tr0-4 : allow all caller-save TR's.  The compiler seems to be able to make
+#         use of them, so it's probably beneficial to performance to save them
+#         and have them available for it.
+#
+# The resources not listed below are callee save, i.e. the compiler is free to
+# use any of them and will spill them to the stack itself.
+
+CFLAGS_fault_64.o += -ffixed-r7 \
+	-ffixed-r8 -ffixed-r9 -ffixed-r10 -ffixed-r11 -ffixed-r12 \
+	-ffixed-r13 -ffixed-r14 -ffixed-r16 -ffixed-r17 -ffixed-r19 \
+	-ffixed-r20 -ffixed-r21 -ffixed-r22 -ffixed-r23 \
+	-ffixed-r24 -ffixed-r25 -ffixed-r26 -ffixed-r27 \
+	-ffixed-r36 -ffixed-r37 -ffixed-r38 -ffixed-r39 -ffixed-r40 \
+	-ffixed-r41 -ffixed-r42 -ffixed-r43  \
+	-ffixed-r60 -ffixed-r61 -ffixed-r62 \
+	-fomit-frame-pointer
+
+EXTRA_CFLAGS += -Werror
diff --git a/arch/sh/mm/Makefile_32 b/arch/sh/mm/Makefile_32
deleted file mode 100644
index 986a1e055834..000000000000
--- a/arch/sh/mm/Makefile_32
+++ /dev/null
@@ -1,43 +0,0 @@
-#
-# Makefile for the Linux SuperH-specific parts of the memory manager.
-#
-
-obj-y			:= init.o extable_32.o consistent.o mmap.o
-
-ifndef CONFIG_CACHE_OFF
-cache-$(CONFIG_CPU_SH2)		:= cache-sh2.o
-cache-$(CONFIG_CPU_SH2A)	:= cache-sh2a.o
-cache-$(CONFIG_CPU_SH3)		:= cache-sh3.o
-cache-$(CONFIG_CPU_SH4)		:= cache-sh4.o
-cache-$(CONFIG_SH7705_CACHE_32KB)	+= cache-sh7705.o
-endif
-
-obj-y			+= $(cache-y)
-
-mmu-y			:= tlb-nommu.o pg-nommu.o
-mmu-$(CONFIG_MMU)	:= fault_32.o tlbflush_32.o ioremap_32.o
-
-obj-y			+= $(mmu-y)
-obj-$(CONFIG_DEBUG_FS)	+= asids-debugfs.o
-
-ifdef CONFIG_DEBUG_FS
-obj-$(CONFIG_CPU_SH4)	+= cache-debugfs.o
-endif
-
-ifdef CONFIG_MMU
-tlb-$(CONFIG_CPU_SH3)		:= tlb-sh3.o
-tlb-$(CONFIG_CPU_SH4)		:= tlb-sh4.o
-tlb-$(CONFIG_CPU_HAS_PTEAEX)	:= tlb-pteaex.o
-obj-y				+= $(tlb-y)
-ifndef CONFIG_CACHE_OFF
-obj-$(CONFIG_CPU_SH4)		+= pg-sh4.o
-obj-$(CONFIG_SH7705_CACHE_32KB)	+= pg-sh7705.o
-endif
-endif
-
-obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
-obj-$(CONFIG_PMB)		+= pmb.o
-obj-$(CONFIG_PMB_FIXED)		+= pmb-fixed.o
-obj-$(CONFIG_NUMA)		+= numa.o
-
-EXTRA_CFLAGS += -Werror
diff --git a/arch/sh/mm/Makefile_64 b/arch/sh/mm/Makefile_64
deleted file mode 100644
index 2863ffb7006d..000000000000
--- a/arch/sh/mm/Makefile_64
+++ /dev/null
@@ -1,46 +0,0 @@
-#
-# Makefile for the Linux SuperH-specific parts of the memory manager.
-#
-
-obj-y			:= init.o consistent.o mmap.o
-
-mmu-y			:= tlb-nommu.o pg-nommu.o extable_32.o
-mmu-$(CONFIG_MMU)	:= fault_64.o ioremap_64.o tlbflush_64.o tlb-sh5.o \
-			   extable_64.o
-
-ifndef CONFIG_CACHE_OFF
-obj-y			+= cache-sh5.o
-endif
-
-obj-y			+= $(mmu-y)
-obj-$(CONFIG_DEBUG_FS)	+= asids-debugfs.o
-
-obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
-obj-$(CONFIG_NUMA)		+= numa.o
-
-EXTRA_CFLAGS += -Werror
-
-# Special flags for fault_64.o.  This puts restrictions on the number of
-# caller-save registers that the compiler can target when building this file.
-# This is required because the code is called from a context in entry.S where
-# very few registers have been saved in the exception handler (for speed
-# reasons).
-# The caller save registers that have been saved and which can be used are
-# r2,r3,r4,r5 : argument passing
-# r15, r18 : SP and LINK
-# tr0-4 : allow all caller-save TR's.  The compiler seems to be able to make
-#         use of them, so it's probably beneficial to performance to save them
-#         and have them available for it.
-#
-# The resources not listed below are callee save, i.e. the compiler is free to
-# use any of them and will spill them to the stack itself.
-
-CFLAGS_fault_64.o += -ffixed-r7 \
-	-ffixed-r8 -ffixed-r9 -ffixed-r10 -ffixed-r11 -ffixed-r12 \
-	-ffixed-r13 -ffixed-r14 -ffixed-r16 -ffixed-r17 -ffixed-r19 \
-	-ffixed-r20 -ffixed-r21 -ffixed-r22 -ffixed-r23 \
-	-ffixed-r24 -ffixed-r25 -ffixed-r26 -ffixed-r27 \
-	-ffixed-r36 -ffixed-r37 -ffixed-r38 -ffixed-r39 -ffixed-r40 \
-	-ffixed-r41 -ffixed-r42 -ffixed-r43  \
-	-ffixed-r60 -ffixed-r61 -ffixed-r62 \
-	-fomit-frame-pointer
diff --git a/arch/sh/mm/cache-sh2.c b/arch/sh/mm/cache-sh2.c
index c4e80d2b764b..699a71f46327 100644
--- a/arch/sh/mm/cache-sh2.c
+++ b/arch/sh/mm/cache-sh2.c
@@ -16,7 +16,7 @@
 #include <asm/cacheflush.h>
 #include <asm/io.h>
 
-void __flush_wback_region(void *start, int size)
+static void sh2__flush_wback_region(void *start, int size)
 {
 	unsigned long v;
 	unsigned long begin, end;
@@ -37,7 +37,7 @@ void __flush_wback_region(void *start, int size)
 	}
 }
 
-void __flush_purge_region(void *start, int size)
+static void sh2__flush_purge_region(void *start, int size)
 {
 	unsigned long v;
 	unsigned long begin, end;
@@ -51,7 +51,7 @@ void __flush_purge_region(void *start, int size)
 			  CACHE_OC_ADDRESS_ARRAY | (v & 0x00000ff0) | 0x00000008);
 }
 
-void __flush_invalidate_region(void *start, int size)
+static void sh2__flush_invalidate_region(void *start, int size)
 {
 #ifdef CONFIG_CACHE_WRITEBACK
 	/*
@@ -82,3 +82,10 @@ void __flush_invalidate_region(void *start, int size)
 			  CACHE_OC_ADDRESS_ARRAY | (v & 0x00000ff0) | 0x00000008);
 #endif
 }
+
+void __init sh2_cache_init(void)
+{
+	__flush_wback_region		= sh2__flush_wback_region;
+	__flush_purge_region		= sh2__flush_purge_region;
+	__flush_invalidate_region	= sh2__flush_invalidate_region;
+}
diff --git a/arch/sh/mm/cache-sh2a.c b/arch/sh/mm/cache-sh2a.c
index 24d86a794065..975899d83564 100644
--- a/arch/sh/mm/cache-sh2a.c
+++ b/arch/sh/mm/cache-sh2a.c
@@ -15,7 +15,7 @@
 #include <asm/cacheflush.h>
 #include <asm/io.h>
 
-void __flush_wback_region(void *start, int size)
+static void sh2a__flush_wback_region(void *start, int size)
 {
 	unsigned long v;
 	unsigned long begin, end;
@@ -44,7 +44,7 @@ void __flush_wback_region(void *start, int size)
 	local_irq_restore(flags);
 }
 
-void __flush_purge_region(void *start, int size)
+static void sh2a__flush_purge_region(void *start, int size)
 {
 	unsigned long v;
 	unsigned long begin, end;
@@ -65,7 +65,7 @@ void __flush_purge_region(void *start, int size)
 	local_irq_restore(flags);
 }
 
-void __flush_invalidate_region(void *start, int size)
+static void sh2a__flush_invalidate_region(void *start, int size)
 {
 	unsigned long v;
 	unsigned long begin, end;
@@ -97,13 +97,15 @@ void __flush_invalidate_region(void *start, int size)
 }
 
 /* WBack O-Cache and flush I-Cache */
-void flush_icache_range(unsigned long start, unsigned long end)
+static void sh2a_flush_icache_range(void *args)
 {
+	struct flusher_data *data = args;
+	unsigned long start, end;
 	unsigned long v;
 	unsigned long flags;
 
-	start = start & ~(L1_CACHE_BYTES-1);
-	end = (end + L1_CACHE_BYTES-1) & ~(L1_CACHE_BYTES-1);
+	start = data->addr1 & ~(L1_CACHE_BYTES-1);
+	end = (data->addr2 + L1_CACHE_BYTES-1) & ~(L1_CACHE_BYTES-1);
 
 	local_irq_save(flags);
 	jump_to_uncached();
@@ -127,3 +129,12 @@ void flush_icache_range(unsigned long start, unsigned long end)
 	back_to_cached();
 	local_irq_restore(flags);
 }
+
+void __init sh2a_cache_init(void)
+{
+	local_flush_icache_range	= sh2a_flush_icache_range;
+
+	__flush_wback_region		= sh2a__flush_wback_region;
+	__flush_purge_region		= sh2a__flush_purge_region;
+	__flush_invalidate_region	= sh2a__flush_invalidate_region;
+}
diff --git a/arch/sh/mm/cache-sh3.c b/arch/sh/mm/cache-sh3.c
index 6d1dbec08ad4..faef80c98134 100644
--- a/arch/sh/mm/cache-sh3.c
+++ b/arch/sh/mm/cache-sh3.c
@@ -32,7 +32,7 @@
  * SIZE: Size of the region.
  */
 
-void __flush_wback_region(void *start, int size)
+static void sh3__flush_wback_region(void *start, int size)
 {
 	unsigned long v, j;
 	unsigned long begin, end;
@@ -71,7 +71,7 @@ void __flush_wback_region(void *start, int size)
  * START: Virtual Address (U0, P1, or P3)
  * SIZE: Size of the region.
  */
-void __flush_purge_region(void *start, int size)
+static void sh3__flush_purge_region(void *start, int size)
 {
 	unsigned long v;
 	unsigned long begin, end;
@@ -90,11 +90,16 @@ void __flush_purge_region(void *start, int size)
 	}
 }
 
-/*
- * No write back please
- *
- * Except I don't think there's any way to avoid the writeback. So we
- * just alias it to __flush_purge_region(). dwmw2.
- */
-void __flush_invalidate_region(void *start, int size)
-	__attribute__((alias("__flush_purge_region")));
+void __init sh3_cache_init(void)
+{
+	__flush_wback_region = sh3__flush_wback_region;
+	__flush_purge_region = sh3__flush_purge_region;
+
+	/*
+	 * No write back please
+	 *
+	 * Except I don't think there's any way to avoid the writeback.
+	 * So we just alias it to sh3__flush_purge_region(). dwmw2.
+	 */
+	__flush_invalidate_region = sh3__flush_purge_region;
+}
diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c
index 5cfe08dbb59e..b2453bbef4cd 100644
--- a/arch/sh/mm/cache-sh4.c
+++ b/arch/sh/mm/cache-sh4.c
@@ -14,6 +14,7 @@
 #include <linux/mm.h>
 #include <linux/io.h>
 #include <linux/mutex.h>
+#include <linux/fs.h>
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
 
@@ -25,13 +26,6 @@
 #define MAX_DCACHE_PAGES	64	/* XXX: Tune for ways */
 #define MAX_ICACHE_PAGES	32
 
-static void __flush_dcache_segment_1way(unsigned long start,
-					unsigned long extent);
-static void __flush_dcache_segment_2way(unsigned long start,
-					unsigned long extent);
-static void __flush_dcache_segment_4way(unsigned long start,
-					unsigned long extent);
-
 static void __flush_cache_4096(unsigned long addr, unsigned long phys,
 			       unsigned long exec_offset);
 
@@ -43,182 +37,56 @@ static void __flush_cache_4096(unsigned long addr, unsigned long phys,
 static void (*__flush_dcache_segment_fn)(unsigned long, unsigned long) =
 	(void (*)(unsigned long, unsigned long))0xdeadbeef;
 
-static void compute_alias(struct cache_info *c)
+/*
+ * Write back the range of D-cache, and purge the I-cache.
+ *
+ * Called from kernel/module.c:sys_init_module and routine for a.out format,
+ * signal handler code and kprobes code
+ */
+static void sh4_flush_icache_range(void *args)
 {
-	c->alias_mask = ((c->sets - 1) << c->entry_shift) & ~(PAGE_SIZE - 1);
-	c->n_aliases = c->alias_mask ? (c->alias_mask >> PAGE_SHIFT) + 1 : 0;
-}
+	struct flusher_data *data = args;
+	unsigned long start, end;
+	unsigned long flags, v;
+	int i;
 
-static void __init emit_cache_params(void)
-{
-	printk("PVR=%08x CVR=%08x PRR=%08x\n",
-		ctrl_inl(CCN_PVR),
-		ctrl_inl(CCN_CVR),
-		ctrl_inl(CCN_PRR));
-	printk("I-cache : n_ways=%d n_sets=%d way_incr=%d\n",
-		boot_cpu_data.icache.ways,
-		boot_cpu_data.icache.sets,
-		boot_cpu_data.icache.way_incr);
-	printk("I-cache : entry_mask=0x%08x alias_mask=0x%08x n_aliases=%d\n",
-		boot_cpu_data.icache.entry_mask,
-		boot_cpu_data.icache.alias_mask,
-		boot_cpu_data.icache.n_aliases);
-	printk("D-cache : n_ways=%d n_sets=%d way_incr=%d\n",
-		boot_cpu_data.dcache.ways,
-		boot_cpu_data.dcache.sets,
-		boot_cpu_data.dcache.way_incr);
-	printk("D-cache : entry_mask=0x%08x alias_mask=0x%08x n_aliases=%d\n",
-		boot_cpu_data.dcache.entry_mask,
-		boot_cpu_data.dcache.alias_mask,
-		boot_cpu_data.dcache.n_aliases);
+	start = data->addr1;
+	end = data->addr2;
 
-	/*
-	 * Emit Secondary Cache parameters if the CPU has a probed L2.
-	 */
-	if (boot_cpu_data.flags & CPU_HAS_L2_CACHE) {
-		printk("S-cache : n_ways=%d n_sets=%d way_incr=%d\n",
-			boot_cpu_data.scache.ways,
-			boot_cpu_data.scache.sets,
-			boot_cpu_data.scache.way_incr);
-		printk("S-cache : entry_mask=0x%08x alias_mask=0x%08x n_aliases=%d\n",
-			boot_cpu_data.scache.entry_mask,
-			boot_cpu_data.scache.alias_mask,
-			boot_cpu_data.scache.n_aliases);
+	/* If there are too many pages then just blow away the caches */
+	if (((end - start) >> PAGE_SHIFT) >= MAX_ICACHE_PAGES) {
+		local_flush_cache_all(NULL);
+		return;
 	}
 
-	if (!__flush_dcache_segment_fn)
-		panic("unknown number of cache ways\n");
-}
+	/*
+	 * Selectively flush d-cache then invalidate the i-cache.
+	 * This is inefficient, so only use this for small ranges.
+	 */
+	start &= ~(L1_CACHE_BYTES-1);
+	end += L1_CACHE_BYTES-1;
+	end &= ~(L1_CACHE_BYTES-1);
 
-/*
- * SH-4 has virtually indexed and physically tagged cache.
- */
-void __init p3_cache_init(void)
-{
-	compute_alias(&boot_cpu_data.icache);
-	compute_alias(&boot_cpu_data.dcache);
-	compute_alias(&boot_cpu_data.scache);
-
-	switch (boot_cpu_data.dcache.ways) {
-	case 1:
-		__flush_dcache_segment_fn = __flush_dcache_segment_1way;
-		break;
-	case 2:
-		__flush_dcache_segment_fn = __flush_dcache_segment_2way;
-		break;
-	case 4:
-		__flush_dcache_segment_fn = __flush_dcache_segment_4way;
-		break;
-	default:
-		__flush_dcache_segment_fn = NULL;
-		break;
-	}
+	local_irq_save(flags);
+	jump_to_uncached();
 
-	emit_cache_params();
-}
+	for (v = start; v < end; v += L1_CACHE_BYTES) {
+		unsigned long icacheaddr;
 
-/*
- * Write back the dirty D-caches, but not invalidate them.
- *
- * START: Virtual Address (U0, P1, or P3)
- * SIZE: Size of the region.
- */
-void __flush_wback_region(void *start, int size)
-{
-	unsigned long v;
-	unsigned long begin, end;
-
-	begin = (unsigned long)start & ~(L1_CACHE_BYTES-1);
-	end = ((unsigned long)start + size + L1_CACHE_BYTES-1)
-		& ~(L1_CACHE_BYTES-1);
-	for (v = begin; v < end; v+=L1_CACHE_BYTES) {
-		asm volatile("ocbwb	%0"
-			     : /* no output */
-			     : "m" (__m(v)));
-	}
-}
+		__ocbwb(v);
 
-/*
- * Write back the dirty D-caches and invalidate them.
- *
- * START: Virtual Address (U0, P1, or P3)
- * SIZE: Size of the region.
- */
-void __flush_purge_region(void *start, int size)
-{
-	unsigned long v;
-	unsigned long begin, end;
-
-	begin = (unsigned long)start & ~(L1_CACHE_BYTES-1);
-	end = ((unsigned long)start + size + L1_CACHE_BYTES-1)
-		& ~(L1_CACHE_BYTES-1);
-	for (v = begin; v < end; v+=L1_CACHE_BYTES) {
-		asm volatile("ocbp	%0"
-			     : /* no output */
-			     : "m" (__m(v)));
-	}
-}
+		icacheaddr = CACHE_IC_ADDRESS_ARRAY | (v &
+				cpu_data->icache.entry_mask);
 
-/*
- * No write back please
- */
-void __flush_invalidate_region(void *start, int size)
-{
-	unsigned long v;
-	unsigned long begin, end;
-
-	begin = (unsigned long)start & ~(L1_CACHE_BYTES-1);
-	end = ((unsigned long)start + size + L1_CACHE_BYTES-1)
-		& ~(L1_CACHE_BYTES-1);
-	for (v = begin; v < end; v+=L1_CACHE_BYTES) {
-		asm volatile("ocbi	%0"
-			     : /* no output */
-			     : "m" (__m(v)));
+		/* Clear i-cache line valid-bit */
+		for (i = 0; i < cpu_data->icache.ways; i++) {
+			__raw_writel(0, icacheaddr);
+			icacheaddr += cpu_data->icache.way_incr;
+		}
 	}
-}
-
-/*
- * Write back the range of D-cache, and purge the I-cache.
- *
- * Called from kernel/module.c:sys_init_module and routine for a.out format,
- * signal handler code and kprobes code
- */
-void flush_icache_range(unsigned long start, unsigned long end)
-{
-	int icacheaddr;
-	unsigned long flags, v;
-	int i;
 
-       /* If there are too many pages then just blow the caches */
-        if (((end - start) >> PAGE_SHIFT) >= MAX_ICACHE_PAGES) {
-                flush_cache_all();
-       } else {
-               /* selectively flush d-cache then invalidate the i-cache */
-               /* this is inefficient, so only use for small ranges */
-               start &= ~(L1_CACHE_BYTES-1);
-               end += L1_CACHE_BYTES-1;
-               end &= ~(L1_CACHE_BYTES-1);
-
-               local_irq_save(flags);
-               jump_to_uncached();
-
-               for (v = start; v < end; v+=L1_CACHE_BYTES) {
-                       asm volatile("ocbwb     %0"
-                                    : /* no output */
-                                    : "m" (__m(v)));
-
-                       icacheaddr = CACHE_IC_ADDRESS_ARRAY | (
-                                       v & cpu_data->icache.entry_mask);
-
-                       for (i = 0; i < cpu_data->icache.ways;
-                               i++, icacheaddr += cpu_data->icache.way_incr)
-                                       /* Clear i-cache line valid-bit */
-                                       ctrl_outl(0, icacheaddr);
-               }
-
-		back_to_cached();
-		local_irq_restore(flags);
-	}
+	back_to_cached();
+	local_irq_restore(flags);
 }
 
 static inline void flush_cache_4096(unsigned long start,
@@ -244,9 +112,17 @@ static inline void flush_cache_4096(unsigned long start,
  * Write back & invalidate the D-cache of the page.
  * (To avoid "alias" issues)
  */
-void flush_dcache_page(struct page *page)
+static void sh4_flush_dcache_page(void *arg)
 {
-	if (test_bit(PG_mapped, &page->flags)) {
+	struct page *page = arg;
+#ifndef CONFIG_SMP
+	struct address_space *mapping = page_mapping(page);
+
+	if (mapping && !mapping_mapped(mapping))
+		set_bit(PG_dcache_dirty, &page->flags);
+	else
+#endif
+	{
 		unsigned long phys = PHYSADDR(page_address(page));
 		unsigned long addr = CACHE_OC_ADDRESS_ARRAY;
 		int i, n;
@@ -282,13 +158,13 @@ static void __uses_jump_to_uncached flush_icache_all(void)
 	local_irq_restore(flags);
 }
 
-void flush_dcache_all(void)
+static inline void flush_dcache_all(void)
 {
 	(*__flush_dcache_segment_fn)(0UL, boot_cpu_data.dcache.way_size);
 	wmb();
 }
 
-void flush_cache_all(void)
+static void sh4_flush_cache_all(void *unused)
 {
 	flush_dcache_all();
 	flush_icache_all();
@@ -380,8 +256,13 @@ loop_exit:
  *
  * Caller takes mm->mmap_sem.
  */
-void flush_cache_mm(struct mm_struct *mm)
+static void sh4_flush_cache_mm(void *arg)
 {
+	struct mm_struct *mm = arg;
+
+	if (cpu_context(smp_processor_id(), mm) == NO_CONTEXT)
+		return;
+
 	/*
 	 * If cache is only 4k-per-way, there are never any 'aliases'.  Since
 	 * the cache is physically tagged, the data can just be left in there.
@@ -417,12 +298,21 @@ void flush_cache_mm(struct mm_struct *mm)
  * ADDR: Virtual Address (U0 address)
  * PFN: Physical page number
  */
-void flush_cache_page(struct vm_area_struct *vma, unsigned long address,
-		      unsigned long pfn)
+static void sh4_flush_cache_page(void *args)
 {
-	unsigned long phys = pfn << PAGE_SHIFT;
+	struct flusher_data *data = args;
+	struct vm_area_struct *vma;
+	unsigned long address, pfn, phys;
 	unsigned int alias_mask;
 
+	vma = data->vma;
+	address = data->addr1;
+	pfn = data->addr2;
+	phys = pfn << PAGE_SHIFT;
+
+	if (cpu_context(smp_processor_id(), vma->vm_mm) == NO_CONTEXT)
+		return;
+
 	alias_mask = boot_cpu_data.dcache.alias_mask;
 
 	/* We only need to flush D-cache when we have alias */
@@ -462,9 +352,19 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long address,
  * Flushing the cache lines for U0 only isn't enough.
  * We need to flush for P1 too, which may contain aliases.
  */
-void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
-		       unsigned long end)
+static void sh4_flush_cache_range(void *args)
 {
+	struct flusher_data *data = args;
+	struct vm_area_struct *vma;
+	unsigned long start, end;
+
+	vma = data->vma;
+	start = data->addr1;
+	end = data->addr2;
+
+	if (cpu_context(smp_processor_id(), vma->vm_mm) == NO_CONTEXT)
+		return;
+
 	/*
 	 * If cache is only 4k-per-way, there are never any 'aliases'.  Since
 	 * the cache is physically tagged, the data can just be left in there.
@@ -492,20 +392,6 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
 	}
 }
 
-/*
- * flush_icache_user_range
- * @vma: VMA of the process
- * @page: page
- * @addr: U0 address
- * @len: length of the range (< page size)
- */
-void flush_icache_user_range(struct vm_area_struct *vma,
-			     struct page *page, unsigned long addr, int len)
-{
-	flush_cache_page(vma, addr, page_to_pfn(page));
-	mb();
-}
-
 /**
  * __flush_cache_4096
  *
@@ -581,7 +467,49 @@ static void __flush_cache_4096(unsigned long addr, unsigned long phys,
  * Break the 1, 2 and 4 way variants of this out into separate functions to
  * avoid nearly all the overhead of having the conditional stuff in the function
  * bodies (+ the 1 and 2 way cases avoid saving any registers too).
+ *
+ * We want to eliminate unnecessary bus transactions, so this code uses
+ * a non-obvious technique.
+ *
+ * Loop over a cache way sized block of, one cache line at a time. For each
+ * line, use movca.a to cause the current cache line contents to be written
+ * back, but without reading anything from main memory. However this has the
+ * side effect that the cache is now caching that memory location. So follow
+ * this with a cache invalidate to mark the cache line invalid. And do all
+ * this with interrupts disabled, to avoid the cache line being accidently
+ * evicted while it is holding garbage.
+ *
+ * This also breaks in a number of circumstances:
+ * - if there are modifications to the region of memory just above
+ *   empty_zero_page (for example because a breakpoint has been placed
+ *   there), then these can be lost.
+ *
+ *   This is because the the memory address which the cache temporarily
+ *   caches in the above description is empty_zero_page. So the
+ *   movca.l hits the cache (it is assumed that it misses, or at least
+ *   isn't dirty), modifies the line and then invalidates it, losing the
+ *   required change.
+ *
+ * - If caches are disabled or configured in write-through mode, then
+ *   the movca.l writes garbage directly into memory.
  */
+static void __flush_dcache_segment_writethrough(unsigned long start,
+					        unsigned long extent_per_way)
+{
+	unsigned long addr;
+	int i;
+
+	addr = CACHE_OC_ADDRESS_ARRAY | (start & cpu_data->dcache.entry_mask);
+
+	while (extent_per_way) {
+		for (i = 0; i < cpu_data->dcache.ways; i++)
+			__raw_writel(0, addr + cpu_data->dcache.way_incr * i);
+
+		addr += cpu_data->dcache.linesz;
+		extent_per_way -= cpu_data->dcache.linesz;
+	}
+}
+
 static void __flush_dcache_segment_1way(unsigned long start,
 					unsigned long extent_per_way)
 {
@@ -773,3 +701,47 @@ static void __flush_dcache_segment_4way(unsigned long start,
 		a3 += linesz;
 	} while (a0 < a0e);
 }
+
+extern void __weak sh4__flush_region_init(void);
+
+/*
+ * SH-4 has virtually indexed and physically tagged cache.
+ */
+void __init sh4_cache_init(void)
+{
+	unsigned int wt_enabled = !!(__raw_readl(CCR) & CCR_CACHE_WT);
+
+	printk("PVR=%08x CVR=%08x PRR=%08x\n",
+		ctrl_inl(CCN_PVR),
+		ctrl_inl(CCN_CVR),
+		ctrl_inl(CCN_PRR));
+
+	if (wt_enabled)
+		__flush_dcache_segment_fn = __flush_dcache_segment_writethrough;
+	else {
+		switch (boot_cpu_data.dcache.ways) {
+		case 1:
+			__flush_dcache_segment_fn = __flush_dcache_segment_1way;
+			break;
+		case 2:
+			__flush_dcache_segment_fn = __flush_dcache_segment_2way;
+			break;
+		case 4:
+			__flush_dcache_segment_fn = __flush_dcache_segment_4way;
+			break;
+		default:
+			panic("unknown number of cache ways\n");
+			break;
+		}
+	}
+
+	local_flush_icache_range	= sh4_flush_icache_range;
+	local_flush_dcache_page		= sh4_flush_dcache_page;
+	local_flush_cache_all		= sh4_flush_cache_all;
+	local_flush_cache_mm		= sh4_flush_cache_mm;
+	local_flush_cache_dup_mm	= sh4_flush_cache_mm;
+	local_flush_cache_page		= sh4_flush_cache_page;
+	local_flush_cache_range		= sh4_flush_cache_range;
+
+	sh4__flush_region_init();
+}
diff --git a/arch/sh/mm/cache-sh5.c b/arch/sh/mm/cache-sh5.c
index 86762092508c..467ff8e260f7 100644
--- a/arch/sh/mm/cache-sh5.c
+++ b/arch/sh/mm/cache-sh5.c
@@ -20,23 +20,11 @@
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 
+extern void __weak sh4__flush_region_init(void);
+
 /* Wired TLB entry for the D-cache */
 static unsigned long long dtlb_cache_slot;
 
-void __init p3_cache_init(void)
-{
-	/* Reserve a slot for dcache colouring in the DTLB */
-	dtlb_cache_slot	= sh64_get_wired_dtlb_entry();
-}
-
-#ifdef CONFIG_DCACHE_DISABLED
-#define sh64_dcache_purge_all()					do { } while (0)
-#define sh64_dcache_purge_coloured_phy_page(paddr, eaddr)	do { } while (0)
-#define sh64_dcache_purge_user_range(mm, start, end)		do { } while (0)
-#define sh64_dcache_purge_phy_page(paddr)			do { } while (0)
-#define sh64_dcache_purge_virt_page(mm, eaddr)			do { } while (0)
-#endif
-
 /*
  * The following group of functions deal with mapping and unmapping a
  * temporary page into a DTLB slot that has been set aside for exclusive
@@ -56,7 +44,6 @@ static inline void sh64_teardown_dtlb_cache_slot(void)
 	local_irq_enable();
 }
 
-#ifndef CONFIG_ICACHE_DISABLED
 static inline void sh64_icache_inv_all(void)
 {
 	unsigned long long addr, flag, data;
@@ -214,52 +201,6 @@ static void sh64_icache_inv_user_page_range(struct mm_struct *mm,
 	}
 }
 
-/*
- * Invalidate a small range of user context I-cache, not necessarily page
- * (or even cache-line) aligned.
- *
- * Since this is used inside ptrace, the ASID in the mm context typically
- * won't match current_asid.  We'll have to switch ASID to do this.  For
- * safety, and given that the range will be small, do all this under cli.
- *
- * Note, there is a hazard that the ASID in mm->context is no longer
- * actually associated with mm, i.e. if the mm->context has started a new
- * cycle since mm was last active.  However, this is just a performance
- * issue: all that happens is that we invalidate lines belonging to
- * another mm, so the owning process has to refill them when that mm goes
- * live again.  mm itself can't have any cache entries because there will
- * have been a flush_cache_all when the new mm->context cycle started.
- */
-static void sh64_icache_inv_user_small_range(struct mm_struct *mm,
-						unsigned long start, int len)
-{
-	unsigned long long eaddr = start;
-	unsigned long long eaddr_end = start + len;
-	unsigned long current_asid, mm_asid;
-	unsigned long flags;
-	unsigned long long epage_start;
-
-	/*
-	 * Align to start of cache line.  Otherwise, suppose len==8 and
-	 * start was at 32N+28 : the last 4 bytes wouldn't get invalidated.
-	 */
-	eaddr = L1_CACHE_ALIGN(start);
-	eaddr_end = start + len;
-
-	mm_asid = cpu_asid(smp_processor_id(), mm);
-	local_irq_save(flags);
-	current_asid = switch_and_save_asid(mm_asid);
-
-	epage_start = eaddr & PAGE_MASK;
-
-	while (eaddr < eaddr_end) {
-		__asm__ __volatile__("icbi %0, 0" : : "r" (eaddr));
-		eaddr += L1_CACHE_BYTES;
-	}
-	switch_and_save_asid(current_asid);
-	local_irq_restore(flags);
-}
-
 static void sh64_icache_inv_current_user_range(unsigned long start, unsigned long end)
 {
 	/* The icbi instruction never raises ITLBMISS.  i.e. if there's not a
@@ -287,9 +228,7 @@ static void sh64_icache_inv_current_user_range(unsigned long start, unsigned lon
 		addr += L1_CACHE_BYTES;
 	}
 }
-#endif /* !CONFIG_ICACHE_DISABLED */
 
-#ifndef CONFIG_DCACHE_DISABLED
 /* Buffer used as the target of alloco instructions to purge data from cache
    sets by natural eviction. -- RPC */
 #define DUMMY_ALLOCO_AREA_SIZE ((L1_CACHE_BYTES << 10) + (1024 * 4))
@@ -541,59 +480,10 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm,
 }
 
 /*
- * Purge the range of addresses from the D-cache.
- *
- * The addresses lie in the superpage mapping. There's no harm if we
- * overpurge at either end - just a small performance loss.
- */
-void __flush_purge_region(void *start, int size)
-{
-	unsigned long long ullend, addr, aligned_start;
-
-	aligned_start = (unsigned long long)(signed long long)(signed long) start;
-	addr = L1_CACHE_ALIGN(aligned_start);
-	ullend = (unsigned long long) (signed long long) (signed long) start + size;
-
-	while (addr <= ullend) {
-		__asm__ __volatile__ ("ocbp %0, 0" : : "r" (addr));
-		addr += L1_CACHE_BYTES;
-	}
-}
-
-void __flush_wback_region(void *start, int size)
-{
-	unsigned long long ullend, addr, aligned_start;
-
-	aligned_start = (unsigned long long)(signed long long)(signed long) start;
-	addr = L1_CACHE_ALIGN(aligned_start);
-	ullend = (unsigned long long) (signed long long) (signed long) start + size;
-
-	while (addr < ullend) {
-		__asm__ __volatile__ ("ocbwb %0, 0" : : "r" (addr));
-		addr += L1_CACHE_BYTES;
-	}
-}
-
-void __flush_invalidate_region(void *start, int size)
-{
-	unsigned long long ullend, addr, aligned_start;
-
-	aligned_start = (unsigned long long)(signed long long)(signed long) start;
-	addr = L1_CACHE_ALIGN(aligned_start);
-	ullend = (unsigned long long) (signed long long) (signed long) start + size;
-
-	while (addr < ullend) {
-		__asm__ __volatile__ ("ocbi %0, 0" : : "r" (addr));
-		addr += L1_CACHE_BYTES;
-	}
-}
-#endif /* !CONFIG_DCACHE_DISABLED */
-
-/*
  * Invalidate the entire contents of both caches, after writing back to
  * memory any dirty data from the D-cache.
  */
-void flush_cache_all(void)
+static void sh5_flush_cache_all(void *unused)
 {
 	sh64_dcache_purge_all();
 	sh64_icache_inv_all();
@@ -620,7 +510,7 @@ void flush_cache_all(void)
  * I-cache.  This is similar to the lack of action needed in
  * flush_tlb_mm - see fault.c.
  */
-void flush_cache_mm(struct mm_struct *mm)
+static void sh5_flush_cache_mm(void *unused)
 {
 	sh64_dcache_purge_all();
 }
@@ -632,13 +522,18 @@ void flush_cache_mm(struct mm_struct *mm)
  *
  * Note, 'end' is 1 byte beyond the end of the range to flush.
  */
-void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
-		       unsigned long end)
+static void sh5_flush_cache_range(void *args)
 {
-	struct mm_struct *mm = vma->vm_mm;
+	struct flusher_data *data = args;
+	struct vm_area_struct *vma;
+	unsigned long start, end;
+
+	vma = data->vma;
+	start = data->addr1;
+	end = data->addr2;
 
-	sh64_dcache_purge_user_range(mm, start, end);
-	sh64_icache_inv_user_page_range(mm, start, end);
+	sh64_dcache_purge_user_range(vma->vm_mm, start, end);
+	sh64_icache_inv_user_page_range(vma->vm_mm, start, end);
 }
 
 /*
@@ -650,16 +545,23 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
  *
  * Note, this is called with pte lock held.
  */
-void flush_cache_page(struct vm_area_struct *vma, unsigned long eaddr,
-		      unsigned long pfn)
+static void sh5_flush_cache_page(void *args)
 {
+	struct flusher_data *data = args;
+	struct vm_area_struct *vma;
+	unsigned long eaddr, pfn;
+
+	vma = data->vma;
+	eaddr = data->addr1;
+	pfn = data->addr2;
+
 	sh64_dcache_purge_phy_page(pfn << PAGE_SHIFT);
 
 	if (vma->vm_flags & VM_EXEC)
 		sh64_icache_inv_user_page(vma, eaddr);
 }
 
-void flush_dcache_page(struct page *page)
+static void sh5_flush_dcache_page(void *page)
 {
 	sh64_dcache_purge_phy_page(page_to_phys(page));
 	wmb();
@@ -673,162 +575,47 @@ void flush_dcache_page(struct page *page)
  * mapping, therefore it's guaranteed that there no cache entries for
  * the range in cache sets of the wrong colour.
  */
-void flush_icache_range(unsigned long start, unsigned long end)
+static void sh5_flush_icache_range(void *args)
 {
+	struct flusher_data *data = args;
+	unsigned long start, end;
+
+	start = data->addr1;
+	end = data->addr2;
+
 	__flush_purge_region((void *)start, end);
 	wmb();
 	sh64_icache_inv_kernel_range(start, end);
 }
 
 /*
- * Flush the range of user (defined by vma->vm_mm) address space starting
- * at 'addr' for 'len' bytes from the cache.  The range does not straddle
- * a page boundary, the unique physical page containing the range is
- * 'page'.  This seems to be used mainly for invalidating an address
- * range following a poke into the program text through the ptrace() call
- * from another process (e.g. for BRK instruction insertion).
- */
-void flush_icache_user_range(struct vm_area_struct *vma,
-			struct page *page, unsigned long addr, int len)
-{
-
-	sh64_dcache_purge_coloured_phy_page(page_to_phys(page), addr);
-	mb();
-
-	if (vma->vm_flags & VM_EXEC)
-		sh64_icache_inv_user_small_range(vma->vm_mm, addr, len);
-}
-
-/*
  * For the address range [start,end), write back the data from the
  * D-cache and invalidate the corresponding region of the I-cache for the
  * current process.  Used to flush signal trampolines on the stack to
  * make them executable.
  */
-void flush_cache_sigtramp(unsigned long vaddr)
+static void sh5_flush_cache_sigtramp(void *vaddr)
 {
-	unsigned long end = vaddr + L1_CACHE_BYTES;
+	unsigned long end = (unsigned long)vaddr + L1_CACHE_BYTES;
 
-	__flush_wback_region((void *)vaddr, L1_CACHE_BYTES);
+	__flush_wback_region(vaddr, L1_CACHE_BYTES);
 	wmb();
-	sh64_icache_inv_current_user_range(vaddr, end);
+	sh64_icache_inv_current_user_range((unsigned long)vaddr, end);
 }
 
-#ifdef CONFIG_MMU
-/*
- * These *MUST* lie in an area of virtual address space that's otherwise
- * unused.
- */
-#define UNIQUE_EADDR_START 0xe0000000UL
-#define UNIQUE_EADDR_END   0xe8000000UL
-
-/*
- * Given a physical address paddr, and a user virtual address user_eaddr
- * which will eventually be mapped to it, create a one-off kernel-private
- * eaddr mapped to the same paddr.  This is used for creating special
- * destination pages for copy_user_page and clear_user_page.
- */
-static unsigned long sh64_make_unique_eaddr(unsigned long user_eaddr,
-					    unsigned long paddr)
-{
-	static unsigned long current_pointer = UNIQUE_EADDR_START;
-	unsigned long coloured_pointer;
-
-	if (current_pointer == UNIQUE_EADDR_END) {
-		sh64_dcache_purge_all();
-		current_pointer = UNIQUE_EADDR_START;
-	}
-
-	coloured_pointer = (current_pointer & ~CACHE_OC_SYN_MASK) |
-				(user_eaddr & CACHE_OC_SYN_MASK);
-	sh64_setup_dtlb_cache_slot(coloured_pointer, get_asid(), paddr);
-
-	current_pointer += (PAGE_SIZE << CACHE_OC_N_SYNBITS);
-
-	return coloured_pointer;
-}
-
-static void sh64_copy_user_page_coloured(void *to, void *from,
-					 unsigned long address)
+void __init sh5_cache_init(void)
 {
-	void *coloured_to;
+	local_flush_cache_all		= sh5_flush_cache_all;
+	local_flush_cache_mm		= sh5_flush_cache_mm;
+	local_flush_cache_dup_mm	= sh5_flush_cache_mm;
+	local_flush_cache_page		= sh5_flush_cache_page;
+	local_flush_cache_range		= sh5_flush_cache_range;
+	local_flush_dcache_page		= sh5_flush_dcache_page;
+	local_flush_icache_range	= sh5_flush_icache_range;
+	local_flush_cache_sigtramp	= sh5_flush_cache_sigtramp;
 
-	/*
-	 * Discard any existing cache entries of the wrong colour.  These are
-	 * present quite often, if the kernel has recently used the page
-	 * internally, then given it up, then it's been allocated to the user.
-	 */
-	sh64_dcache_purge_coloured_phy_page(__pa(to), (unsigned long)to);
-
-	coloured_to = (void *)sh64_make_unique_eaddr(address, __pa(to));
-	copy_page(from, coloured_to);
-
-	sh64_teardown_dtlb_cache_slot();
-}
-
-static void sh64_clear_user_page_coloured(void *to, unsigned long address)
-{
-	void *coloured_to;
-
-	/*
-	 * Discard any existing kernel-originated lines of the wrong
-	 * colour (as above)
-	 */
-	sh64_dcache_purge_coloured_phy_page(__pa(to), (unsigned long)to);
-
-	coloured_to = (void *)sh64_make_unique_eaddr(address, __pa(to));
-	clear_page(coloured_to);
-
-	sh64_teardown_dtlb_cache_slot();
-}
-
-/*
- * 'from' and 'to' are kernel virtual addresses (within the superpage
- * mapping of the physical RAM).  'address' is the user virtual address
- * where the copy 'to' will be mapped after.  This allows a custom
- * mapping to be used to ensure that the new copy is placed in the
- * right cache sets for the user to see it without having to bounce it
- * out via memory.  Note however : the call to flush_page_to_ram in
- * (generic)/mm/memory.c:(break_cow) undoes all this good work in that one
- * very important case!
- *
- * TBD : can we guarantee that on every call, any cache entries for
- * 'from' are in the same colour sets as 'address' also?  i.e. is this
- * always used just to deal with COW?  (I suspect not).
- *
- * There are two possibilities here for when the page 'from' was last accessed:
- * - by the kernel : this is OK, no purge required.
- * - by the/a user (e.g. for break_COW) : need to purge.
- *
- * If the potential user mapping at 'address' is the same colour as
- * 'from' there is no need to purge any cache lines from the 'from'
- * page mapped into cache sets of colour 'address'.  (The copy will be
- * accessing the page through 'from').
- */
-void copy_user_page(void *to, void *from, unsigned long address,
-		    struct page *page)
-{
-	if (((address ^ (unsigned long) from) & CACHE_OC_SYN_MASK) != 0)
-		sh64_dcache_purge_coloured_phy_page(__pa(from), address);
-
-	if (((address ^ (unsigned long) to) & CACHE_OC_SYN_MASK) == 0)
-		copy_page(to, from);
-	else
-		sh64_copy_user_page_coloured(to, from, address);
-}
+	/* Reserve a slot for dcache colouring in the DTLB */
+	dtlb_cache_slot	= sh64_get_wired_dtlb_entry();
 
-/*
- * 'to' is a kernel virtual address (within the superpage mapping of the
- * physical RAM).  'address' is the user virtual address where the 'to'
- * page will be mapped after.  This allows a custom mapping to be used to
- * ensure that the new copy is placed in the right cache sets for the
- * user to see it without having to bounce it out via memory.
- */
-void clear_user_page(void *to, unsigned long address, struct page *page)
-{
-	if (((address ^ (unsigned long) to) & CACHE_OC_SYN_MASK) == 0)
-		clear_page(to);
-	else
-		sh64_clear_user_page_coloured(to, address);
+	sh4__flush_region_init();
 }
-#endif
diff --git a/arch/sh/mm/cache-sh7705.c b/arch/sh/mm/cache-sh7705.c
index 22dacc778823..2cadee2037ac 100644
--- a/arch/sh/mm/cache-sh7705.c
+++ b/arch/sh/mm/cache-sh7705.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
+#include <linux/fs.h>
 #include <linux/threads.h>
 #include <asm/addrspace.h>
 #include <asm/page.h>
@@ -63,15 +64,21 @@ static inline void cache_wback_all(void)
  *
  * Called from kernel/module.c:sys_init_module and routine for a.out format.
  */
-void flush_icache_range(unsigned long start, unsigned long end)
+static void sh7705_flush_icache_range(void *args)
 {
+	struct flusher_data *data = args;
+	unsigned long start, end;
+
+	start = data->addr1;
+	end = data->addr2;
+
 	__flush_wback_region((void *)start, end - start);
 }
 
 /*
  * Writeback&Invalidate the D-cache of the page
  */
-static void __uses_jump_to_uncached __flush_dcache_page(unsigned long phys)
+static void __flush_dcache_page(unsigned long phys)
 {
 	unsigned long ways, waysize, addrstart;
 	unsigned long flags;
@@ -126,13 +133,18 @@ static void __uses_jump_to_uncached __flush_dcache_page(unsigned long phys)
  * Write back & invalidate the D-cache of the page.
  * (To avoid "alias" issues)
  */
-void flush_dcache_page(struct page *page)
+static void sh7705_flush_dcache_page(void *arg)
 {
-	if (test_bit(PG_mapped, &page->flags))
+	struct page *page = arg;
+	struct address_space *mapping = page_mapping(page);
+
+	if (mapping && !mapping_mapped(mapping))
+		set_bit(PG_dcache_dirty, &page->flags);
+	else
 		__flush_dcache_page(PHYSADDR(page_address(page)));
 }
 
-void __uses_jump_to_uncached flush_cache_all(void)
+static void sh7705_flush_cache_all(void *args)
 {
 	unsigned long flags;
 
@@ -144,44 +156,16 @@ void __uses_jump_to_uncached flush_cache_all(void)
 	local_irq_restore(flags);
 }
 
-void flush_cache_mm(struct mm_struct *mm)
-{
-	/* Is there any good way? */
-	/* XXX: possibly call flush_cache_range for each vm area */
-	flush_cache_all();
-}
-
-/*
- * Write back and invalidate D-caches.
- *
- * START, END: Virtual Address (U0 address)
- *
- * NOTE: We need to flush the _physical_ page entry.
- * Flushing the cache lines for U0 only isn't enough.
- * We need to flush for P1 too, which may contain aliases.
- */
-void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
-		       unsigned long end)
-{
-
-	/*
-	 * We could call flush_cache_page for the pages of these range,
-	 * but it's not efficient (scan the caches all the time...).
-	 *
-	 * We can't use A-bit magic, as there's the case we don't have
-	 * valid entry on TLB.
-	 */
-	flush_cache_all();
-}
-
 /*
  * Write back and invalidate I/D-caches for the page.
  *
  * ADDRESS: Virtual Address (U0 address)
  */
-void flush_cache_page(struct vm_area_struct *vma, unsigned long address,
-		      unsigned long pfn)
+static void sh7705_flush_cache_page(void *args)
 {
+	struct flusher_data *data = args;
+	unsigned long pfn = data->addr2;
+
 	__flush_dcache_page(pfn << PAGE_SHIFT);
 }
 
@@ -193,7 +177,19 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long address,
  * Not entirely sure why this is necessary on SH3 with 32K cache but
  * without it we get occasional "Memory fault" when loading a program.
  */
-void flush_icache_page(struct vm_area_struct *vma, struct page *page)
+static void sh7705_flush_icache_page(void *page)
 {
 	__flush_purge_region(page_address(page), PAGE_SIZE);
 }
+
+void __init sh7705_cache_init(void)
+{
+	local_flush_icache_range	= sh7705_flush_icache_range;
+	local_flush_dcache_page		= sh7705_flush_dcache_page;
+	local_flush_cache_all		= sh7705_flush_cache_all;
+	local_flush_cache_mm		= sh7705_flush_cache_all;
+	local_flush_cache_dup_mm	= sh7705_flush_cache_all;
+	local_flush_cache_range		= sh7705_flush_cache_all;
+	local_flush_cache_page		= sh7705_flush_cache_page;
+	local_flush_icache_page		= sh7705_flush_icache_page;
+}
diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c
new file mode 100644
index 000000000000..35c37b7f717a
--- /dev/null
+++ b/arch/sh/mm/cache.c
@@ -0,0 +1,316 @@
+/*
+ * arch/sh/mm/cache.c
+ *
+ * Copyright (C) 1999, 2000, 2002  Niibe Yutaka
+ * Copyright (C) 2002 - 2009  Paul Mundt
+ *
+ * Released under the terms of the GNU GPL v2.0.
+ */
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/smp.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <asm/mmu_context.h>
+#include <asm/cacheflush.h>
+
+void (*local_flush_cache_all)(void *args) = cache_noop;
+void (*local_flush_cache_mm)(void *args) = cache_noop;
+void (*local_flush_cache_dup_mm)(void *args) = cache_noop;
+void (*local_flush_cache_page)(void *args) = cache_noop;
+void (*local_flush_cache_range)(void *args) = cache_noop;
+void (*local_flush_dcache_page)(void *args) = cache_noop;
+void (*local_flush_icache_range)(void *args) = cache_noop;
+void (*local_flush_icache_page)(void *args) = cache_noop;
+void (*local_flush_cache_sigtramp)(void *args) = cache_noop;
+
+void (*__flush_wback_region)(void *start, int size);
+void (*__flush_purge_region)(void *start, int size);
+void (*__flush_invalidate_region)(void *start, int size);
+
+static inline void noop__flush_region(void *start, int size)
+{
+}
+
+static inline void cacheop_on_each_cpu(void (*func) (void *info), void *info,
+                                   int wait)
+{
+	preempt_disable();
+	smp_call_function(func, info, wait);
+	func(info);
+	preempt_enable();
+}
+
+void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
+		       unsigned long vaddr, void *dst, const void *src,
+		       unsigned long len)
+{
+	if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+	    !test_bit(PG_dcache_dirty, &page->flags)) {
+		void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
+		memcpy(vto, src, len);
+		kunmap_coherent(vto);
+	} else {
+		memcpy(dst, src, len);
+		if (boot_cpu_data.dcache.n_aliases)
+			set_bit(PG_dcache_dirty, &page->flags);
+	}
+
+	if (vma->vm_flags & VM_EXEC)
+		flush_cache_page(vma, vaddr, page_to_pfn(page));
+}
+
+void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
+			 unsigned long vaddr, void *dst, const void *src,
+			 unsigned long len)
+{
+	if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+	    !test_bit(PG_dcache_dirty, &page->flags)) {
+		void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
+		memcpy(dst, vfrom, len);
+		kunmap_coherent(vfrom);
+	} else {
+		memcpy(dst, src, len);
+		if (boot_cpu_data.dcache.n_aliases)
+			set_bit(PG_dcache_dirty, &page->flags);
+	}
+}
+
+void copy_user_highpage(struct page *to, struct page *from,
+			unsigned long vaddr, struct vm_area_struct *vma)
+{
+	void *vfrom, *vto;
+
+	vto = kmap_atomic(to, KM_USER1);
+
+	if (boot_cpu_data.dcache.n_aliases && page_mapped(from) &&
+	    !test_bit(PG_dcache_dirty, &from->flags)) {
+		vfrom = kmap_coherent(from, vaddr);
+		copy_page(vto, vfrom);
+		kunmap_coherent(vfrom);
+	} else {
+		vfrom = kmap_atomic(from, KM_USER0);
+		copy_page(vto, vfrom);
+		kunmap_atomic(vfrom, KM_USER0);
+	}
+
+	if (pages_do_alias((unsigned long)vto, vaddr & PAGE_MASK))
+		__flush_purge_region(vto, PAGE_SIZE);
+
+	kunmap_atomic(vto, KM_USER1);
+	/* Make sure this page is cleared on other CPU's too before using it */
+	smp_wmb();
+}
+EXPORT_SYMBOL(copy_user_highpage);
+
+void clear_user_highpage(struct page *page, unsigned long vaddr)
+{
+	void *kaddr = kmap_atomic(page, KM_USER0);
+
+	clear_page(kaddr);
+
+	if (pages_do_alias((unsigned long)kaddr, vaddr & PAGE_MASK))
+		__flush_purge_region(kaddr, PAGE_SIZE);
+
+	kunmap_atomic(kaddr, KM_USER0);
+}
+EXPORT_SYMBOL(clear_user_highpage);
+
+void __update_cache(struct vm_area_struct *vma,
+		    unsigned long address, pte_t pte)
+{
+	struct page *page;
+	unsigned long pfn = pte_pfn(pte);
+
+	if (!boot_cpu_data.dcache.n_aliases)
+		return;
+
+	page = pfn_to_page(pfn);
+	if (pfn_valid(pfn) && page_mapping(page)) {
+		int dirty = test_and_clear_bit(PG_dcache_dirty, &page->flags);
+		if (dirty) {
+			unsigned long addr = (unsigned long)page_address(page);
+
+			if (pages_do_alias(addr, address & PAGE_MASK))
+				__flush_purge_region((void *)addr, PAGE_SIZE);
+		}
+	}
+}
+
+void __flush_anon_page(struct page *page, unsigned long vmaddr)
+{
+	unsigned long addr = (unsigned long) page_address(page);
+
+	if (pages_do_alias(addr, vmaddr)) {
+		if (boot_cpu_data.dcache.n_aliases && page_mapped(page) &&
+		    !test_bit(PG_dcache_dirty, &page->flags)) {
+			void *kaddr;
+
+			kaddr = kmap_coherent(page, vmaddr);
+			/* XXX.. For now kunmap_coherent() does a purge */
+			/* __flush_purge_region((void *)kaddr, PAGE_SIZE); */
+			kunmap_coherent(kaddr);
+		} else
+			__flush_purge_region((void *)addr, PAGE_SIZE);
+	}
+}
+
+void flush_cache_all(void)
+{
+	cacheop_on_each_cpu(local_flush_cache_all, NULL, 1);
+}
+
+void flush_cache_mm(struct mm_struct *mm)
+{
+	cacheop_on_each_cpu(local_flush_cache_mm, mm, 1);
+}
+
+void flush_cache_dup_mm(struct mm_struct *mm)
+{
+	cacheop_on_each_cpu(local_flush_cache_dup_mm, mm, 1);
+}
+
+void flush_cache_page(struct vm_area_struct *vma, unsigned long addr,
+		      unsigned long pfn)
+{
+	struct flusher_data data;
+
+	data.vma = vma;
+	data.addr1 = addr;
+	data.addr2 = pfn;
+
+	cacheop_on_each_cpu(local_flush_cache_page, (void *)&data, 1);
+}
+
+void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
+		       unsigned long end)
+{
+	struct flusher_data data;
+
+	data.vma = vma;
+	data.addr1 = start;
+	data.addr2 = end;
+
+	cacheop_on_each_cpu(local_flush_cache_range, (void *)&data, 1);
+}
+
+void flush_dcache_page(struct page *page)
+{
+	cacheop_on_each_cpu(local_flush_dcache_page, page, 1);
+}
+
+void flush_icache_range(unsigned long start, unsigned long end)
+{
+	struct flusher_data data;
+
+	data.vma = NULL;
+	data.addr1 = start;
+	data.addr2 = end;
+
+	cacheop_on_each_cpu(local_flush_icache_range, (void *)&data, 1);
+}
+
+void flush_icache_page(struct vm_area_struct *vma, struct page *page)
+{
+	/* Nothing uses the VMA, so just pass the struct page along */
+	cacheop_on_each_cpu(local_flush_icache_page, page, 1);
+}
+
+void flush_cache_sigtramp(unsigned long address)
+{
+	cacheop_on_each_cpu(local_flush_cache_sigtramp, (void *)address, 1);
+}
+
+static void compute_alias(struct cache_info *c)
+{
+	c->alias_mask = ((c->sets - 1) << c->entry_shift) & ~(PAGE_SIZE - 1);
+	c->n_aliases = c->alias_mask ? (c->alias_mask >> PAGE_SHIFT) + 1 : 0;
+}
+
+static void __init emit_cache_params(void)
+{
+	printk(KERN_NOTICE "I-cache : n_ways=%d n_sets=%d way_incr=%d\n",
+		boot_cpu_data.icache.ways,
+		boot_cpu_data.icache.sets,
+		boot_cpu_data.icache.way_incr);
+	printk(KERN_NOTICE "I-cache : entry_mask=0x%08x alias_mask=0x%08x n_aliases=%d\n",
+		boot_cpu_data.icache.entry_mask,
+		boot_cpu_data.icache.alias_mask,
+		boot_cpu_data.icache.n_aliases);
+	printk(KERN_NOTICE "D-cache : n_ways=%d n_sets=%d way_incr=%d\n",
+		boot_cpu_data.dcache.ways,
+		boot_cpu_data.dcache.sets,
+		boot_cpu_data.dcache.way_incr);
+	printk(KERN_NOTICE "D-cache : entry_mask=0x%08x alias_mask=0x%08x n_aliases=%d\n",
+		boot_cpu_data.dcache.entry_mask,
+		boot_cpu_data.dcache.alias_mask,
+		boot_cpu_data.dcache.n_aliases);
+
+	/*
+	 * Emit Secondary Cache parameters if the CPU has a probed L2.
+	 */
+	if (boot_cpu_data.flags & CPU_HAS_L2_CACHE) {
+		printk(KERN_NOTICE "S-cache : n_ways=%d n_sets=%d way_incr=%d\n",
+			boot_cpu_data.scache.ways,
+			boot_cpu_data.scache.sets,
+			boot_cpu_data.scache.way_incr);
+		printk(KERN_NOTICE "S-cache : entry_mask=0x%08x alias_mask=0x%08x n_aliases=%d\n",
+			boot_cpu_data.scache.entry_mask,
+			boot_cpu_data.scache.alias_mask,
+			boot_cpu_data.scache.n_aliases);
+	}
+}
+
+void __init cpu_cache_init(void)
+{
+	compute_alias(&boot_cpu_data.icache);
+	compute_alias(&boot_cpu_data.dcache);
+	compute_alias(&boot_cpu_data.scache);
+
+	__flush_wback_region		= noop__flush_region;
+	__flush_purge_region		= noop__flush_region;
+	__flush_invalidate_region	= noop__flush_region;
+
+	if (boot_cpu_data.family == CPU_FAMILY_SH2) {
+		extern void __weak sh2_cache_init(void);
+
+		sh2_cache_init();
+	}
+
+	if (boot_cpu_data.family == CPU_FAMILY_SH2A) {
+		extern void __weak sh2a_cache_init(void);
+
+		sh2a_cache_init();
+	}
+
+	if (boot_cpu_data.family == CPU_FAMILY_SH3) {
+		extern void __weak sh3_cache_init(void);
+
+		sh3_cache_init();
+
+		if ((boot_cpu_data.type == CPU_SH7705) &&
+		    (boot_cpu_data.dcache.sets == 512)) {
+			extern void __weak sh7705_cache_init(void);
+
+			sh7705_cache_init();
+		}
+	}
+
+	if ((boot_cpu_data.family == CPU_FAMILY_SH4) ||
+	    (boot_cpu_data.family == CPU_FAMILY_SH4A) ||
+	    (boot_cpu_data.family == CPU_FAMILY_SH4AL_DSP)) {
+		extern void __weak sh4_cache_init(void);
+
+		sh4_cache_init();
+	}
+
+	if (boot_cpu_data.family == CPU_FAMILY_SH5) {
+		extern void __weak sh5_cache_init(void);
+
+		sh5_cache_init();
+	}
+
+	emit_cache_params();
+}
diff --git a/arch/sh/mm/fault_32.c b/arch/sh/mm/fault_32.c
index 71925946f1e1..47530104e0ad 100644
--- a/arch/sh/mm/fault_32.c
+++ b/arch/sh/mm/fault_32.c
@@ -2,7 +2,7 @@
  * Page fault handler for SH with an MMU.
  *
  *  Copyright (C) 1999  Niibe Yutaka
- *  Copyright (C) 2003 - 2008  Paul Mundt
+ *  Copyright (C) 2003 - 2009  Paul Mundt
  *
  *  Based on linux/arch/i386/mm/fault.c:
  *   Copyright (C) 1995  Linus Torvalds
@@ -15,7 +15,7 @@
 #include <linux/mm.h>
 #include <linux/hardirq.h>
 #include <linux/kprobes.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <asm/io_trapped.h>
 #include <asm/system.h>
 #include <asm/mmu_context.h>
@@ -25,18 +25,91 @@ static inline int notify_page_fault(struct pt_regs *regs, int trap)
 {
 	int ret = 0;
 
-#ifdef CONFIG_KPROBES
-	if (!user_mode(regs)) {
+	if (kprobes_built_in() && !user_mode(regs)) {
 		preempt_disable();
 		if (kprobe_running() && kprobe_fault_handler(regs, trap))
 			ret = 1;
 		preempt_enable();
 	}
-#endif
 
 	return ret;
 }
 
+static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
+{
+	unsigned index = pgd_index(address);
+	pgd_t *pgd_k;
+	pud_t *pud, *pud_k;
+	pmd_t *pmd, *pmd_k;
+
+	pgd += index;
+	pgd_k = init_mm.pgd + index;
+
+	if (!pgd_present(*pgd_k))
+		return NULL;
+
+	pud = pud_offset(pgd, address);
+	pud_k = pud_offset(pgd_k, address);
+	if (!pud_present(*pud_k))
+		return NULL;
+
+	pmd = pmd_offset(pud, address);
+	pmd_k = pmd_offset(pud_k, address);
+	if (!pmd_present(*pmd_k))
+		return NULL;
+
+	if (!pmd_present(*pmd))
+		set_pmd(pmd, *pmd_k);
+	else {
+		/*
+		 * The page tables are fully synchronised so there must
+		 * be another reason for the fault. Return NULL here to
+		 * signal that we have not taken care of the fault.
+		 */
+		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+		return NULL;
+	}
+
+	return pmd_k;
+}
+
+/*
+ * Handle a fault on the vmalloc or module mapping area
+ */
+static noinline int vmalloc_fault(unsigned long address)
+{
+	pgd_t *pgd_k;
+	pmd_t *pmd_k;
+	pte_t *pte_k;
+
+	/* Make sure we are in vmalloc/module/P3 area: */
+	if (!(address >= VMALLOC_START && address < P3_ADDR_MAX))
+		return -1;
+
+	/*
+	 * Synchronize this task's top level page-table
+	 * with the 'reference' page table.
+	 *
+	 * Do _not_ use "current" here. We might be inside
+	 * an interrupt in the middle of a task switch..
+	 */
+	pgd_k = get_TTB();
+	pmd_k = vmalloc_sync_one(pgd_k, address);
+	if (!pmd_k)
+		return -1;
+
+	pte_k = pte_offset_kernel(pmd_k, address);
+	if (!pte_present(*pte_k))
+		return -1;
+
+	return 0;
+}
+
+static int fault_in_kernel_space(unsigned long address)
+{
+	return address >= TASK_SIZE;
+}
+
 /*
  * This routine handles page faults.  It determines the address,
  * and the problem, and then passes it off to one of the appropriate
@@ -46,6 +119,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 					unsigned long writeaccess,
 					unsigned long address)
 {
+	unsigned long vec;
 	struct task_struct *tsk;
 	struct mm_struct *mm;
 	struct vm_area_struct * vma;
@@ -53,70 +127,41 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	int fault;
 	siginfo_t info;
 
-	/*
-	 * We don't bother with any notifier callbacks here, as they are
-	 * all handled through the __do_page_fault() fast-path.
-	 */
-
 	tsk = current;
+	mm = tsk->mm;
 	si_code = SEGV_MAPERR;
+	vec = lookup_exception_vector();
 
-	if (unlikely(address >= TASK_SIZE)) {
-		/*
-		 * Synchronize this task's top level page-table
-		 * with the 'reference' page table.
-		 *
-		 * Do _not_ use "tsk" here. We might be inside
-		 * an interrupt in the middle of a task switch..
-		 */
-		int offset = pgd_index(address);
-		pgd_t *pgd, *pgd_k;
-		pud_t *pud, *pud_k;
-		pmd_t *pmd, *pmd_k;
-
-		pgd = get_TTB() + offset;
-		pgd_k = swapper_pg_dir + offset;
-
-		if (!pgd_present(*pgd)) {
-			if (!pgd_present(*pgd_k))
-				goto bad_area_nosemaphore;
-			set_pgd(pgd, *pgd_k);
+	/*
+	 * We fault-in kernel-space virtual memory on-demand. The
+	 * 'reference' page table is init_mm.pgd.
+	 *
+	 * NOTE! We MUST NOT take any locks for this case. We may
+	 * be in an interrupt or a critical region, and should
+	 * only copy the information from the master page table,
+	 * nothing more.
+	 */
+	if (unlikely(fault_in_kernel_space(address))) {
+		if (vmalloc_fault(address) >= 0)
 			return;
-		}
-
-		pud = pud_offset(pgd, address);
-		pud_k = pud_offset(pgd_k, address);
-
-		if (!pud_present(*pud)) {
-			if (!pud_present(*pud_k))
-				goto bad_area_nosemaphore;
-			set_pud(pud, *pud_k);
+		if (notify_page_fault(regs, vec))
 			return;
-		}
 
-		pmd = pmd_offset(pud, address);
-		pmd_k = pmd_offset(pud_k, address);
-		if (pmd_present(*pmd) || !pmd_present(*pmd_k))
-			goto bad_area_nosemaphore;
-		set_pmd(pmd, *pmd_k);
-
-		return;
+		goto bad_area_nosemaphore;
 	}
 
-	mm = tsk->mm;
-
-	if (unlikely(notify_page_fault(regs, lookup_exception_vector())))
+	if (unlikely(notify_page_fault(regs, vec)))
 		return;
 
 	/* Only enable interrupts if they were on before the fault */
 	if ((regs->sr & SR_IMASK) != SR_IMASK)
 		local_irq_enable();
 
-	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 
 	/*
-	 * If we're in an interrupt or have no user
-	 * context, we must not take the fault..
+	 * If we're in an interrupt, have no user context or are running
+	 * in an atomic region then we must not take the fault:
 	 */
 	if (in_atomic() || !mm)
 		goto no_context;
@@ -132,10 +177,11 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 		goto bad_area;
 	if (expand_stack(vma, address))
 		goto bad_area;
-/*
- * Ok, we have a good vm_area for this memory access, so
- * we can handle it..
- */
+
+	/*
+	 * Ok, we have a good vm_area for this memory access, so
+	 * we can handle it..
+	 */
 good_area:
 	si_code = SEGV_ACCERR;
 	if (writeaccess) {
@@ -162,21 +208,21 @@ survive:
 	}
 	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 				     regs, address);
 	} else {
 		tsk->min_flt++;
-		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 				     regs, address);
 	}
 
 	up_read(&mm->mmap_sem);
 	return;
 
-/*
- * Something tried to access memory that isn't in our memory map..
- * Fix it, but check if it's kernel or user first..
- */
+	/*
+	 * Something tried to access memory that isn't in our memory map..
+	 * Fix it, but check if it's kernel or user first..
+	 */
 bad_area:
 	up_read(&mm->mmap_sem);
 
@@ -272,16 +318,15 @@ do_sigbus:
 /*
  * Called with interrupts disabled.
  */
-asmlinkage int __kprobes __do_page_fault(struct pt_regs *regs,
-					 unsigned long writeaccess,
-					 unsigned long address)
+asmlinkage int __kprobes
+handle_tlbmiss(struct pt_regs *regs, unsigned long writeaccess,
+	       unsigned long address)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *pte;
 	pte_t entry;
-	int ret = 1;
 
 	/*
 	 * We don't take page faults for P1, P2, and parts of P4, these
@@ -292,40 +337,41 @@ asmlinkage int __kprobes __do_page_fault(struct pt_regs *regs,
 		pgd = pgd_offset_k(address);
 	} else {
 		if (unlikely(address >= TASK_SIZE || !current->mm))
-			goto out;
+			return 1;
 
 		pgd = pgd_offset(current->mm, address);
 	}
 
 	pud = pud_offset(pgd, address);
 	if (pud_none_or_clear_bad(pud))
-		goto out;
+		return 1;
 	pmd = pmd_offset(pud, address);
 	if (pmd_none_or_clear_bad(pmd))
-		goto out;
+		return 1;
 	pte = pte_offset_kernel(pmd, address);
 	entry = *pte;
 	if (unlikely(pte_none(entry) || pte_not_present(entry)))
-		goto out;
+		return 1;
 	if (unlikely(writeaccess && !pte_write(entry)))
-		goto out;
+		return 1;
 
 	if (writeaccess)
 		entry = pte_mkdirty(entry);
 	entry = pte_mkyoung(entry);
 
+	set_pte(pte, entry);
+
 #if defined(CONFIG_CPU_SH4) && !defined(CONFIG_SMP)
 	/*
-	 * ITLB is not affected by "ldtlb" instruction.
-	 * So, we need to flush the entry by ourselves.
+	 * SH-4 does not set MMUCR.RC to the corresponding TLB entry in
+	 * the case of an initial page write exception, so we need to
+	 * flush it in order to avoid potential TLB entry duplication.
 	 */
-	local_flush_tlb_one(get_asid(), address & PAGE_MASK);
+	if (writeaccess == 2)
+		local_flush_tlb_one(get_asid(), address & PAGE_MASK);
 #endif
 
-	set_pte(pte, entry);
 	update_mmu_cache(NULL, address, entry);
 
-	ret = 0;
-out:
-	return ret;
+	return 0;
 }
diff --git a/arch/sh/mm/fault_64.c b/arch/sh/mm/fault_64.c
index bd63b961b2a9..2b356cec2489 100644
--- a/arch/sh/mm/fault_64.c
+++ b/arch/sh/mm/fault_64.c
@@ -56,16 +56,7 @@ inline void __do_tlb_refill(unsigned long address,
 	/*
 	 * Set PTEH register
 	 */
-	pteh = address & MMU_VPN_MASK;
-
-	/* Sign extend based on neff. */
-#if (NEFF == 32)
-	/* Faster sign extension */
-	pteh = (unsigned long long)(signed long long)(signed long)pteh;
-#else
-	/* General case */
-	pteh = (pteh & NEFF_SIGN) ? (pteh | NEFF_MASK) : pteh;
-#endif
+	pteh = neff_sign_extend(address & MMU_VPN_MASK);
 
 	/* Set the ASID. */
 	pteh |= get_asid() << PTEH_ASID_SHIFT;
diff --git a/arch/sh/mm/flush-sh4.c b/arch/sh/mm/flush-sh4.c
new file mode 100644
index 000000000000..cef402678f42
--- /dev/null
+++ b/arch/sh/mm/flush-sh4.c
@@ -0,0 +1,108 @@
+#include <linux/mm.h>
+#include <asm/mmu_context.h>
+#include <asm/cacheflush.h>
+
+/*
+ * Write back the dirty D-caches, but not invalidate them.
+ *
+ * START: Virtual Address (U0, P1, or P3)
+ * SIZE: Size of the region.
+ */
+static void sh4__flush_wback_region(void *start, int size)
+{
+	reg_size_t aligned_start, v, cnt, end;
+
+	aligned_start = register_align(start);
+	v = aligned_start & ~(L1_CACHE_BYTES-1);
+	end = (aligned_start + size + L1_CACHE_BYTES-1)
+		& ~(L1_CACHE_BYTES-1);
+	cnt = (end - v) / L1_CACHE_BYTES;
+
+	while (cnt >= 8) {
+		__ocbwb(v); v += L1_CACHE_BYTES;
+		__ocbwb(v); v += L1_CACHE_BYTES;
+		__ocbwb(v); v += L1_CACHE_BYTES;
+		__ocbwb(v); v += L1_CACHE_BYTES;
+		__ocbwb(v); v += L1_CACHE_BYTES;
+		__ocbwb(v); v += L1_CACHE_BYTES;
+		__ocbwb(v); v += L1_CACHE_BYTES;
+		__ocbwb(v); v += L1_CACHE_BYTES;
+		cnt -= 8;
+	}
+
+	while (cnt) {
+		__ocbwb(v); v += L1_CACHE_BYTES;
+		cnt--;
+	}
+}
+
+/*
+ * Write back the dirty D-caches and invalidate them.
+ *
+ * START: Virtual Address (U0, P1, or P3)
+ * SIZE: Size of the region.
+ */
+static void sh4__flush_purge_region(void *start, int size)
+{
+	reg_size_t aligned_start, v, cnt, end;
+
+	aligned_start = register_align(start);
+	v = aligned_start & ~(L1_CACHE_BYTES-1);
+	end = (aligned_start + size + L1_CACHE_BYTES-1)
+		& ~(L1_CACHE_BYTES-1);
+	cnt = (end - v) / L1_CACHE_BYTES;
+
+	while (cnt >= 8) {
+		__ocbp(v); v += L1_CACHE_BYTES;
+		__ocbp(v); v += L1_CACHE_BYTES;
+		__ocbp(v); v += L1_CACHE_BYTES;
+		__ocbp(v); v += L1_CACHE_BYTES;
+		__ocbp(v); v += L1_CACHE_BYTES;
+		__ocbp(v); v += L1_CACHE_BYTES;
+		__ocbp(v); v += L1_CACHE_BYTES;
+		__ocbp(v); v += L1_CACHE_BYTES;
+		cnt -= 8;
+	}
+	while (cnt) {
+		__ocbp(v); v += L1_CACHE_BYTES;
+		cnt--;
+	}
+}
+
+/*
+ * No write back please
+ */
+static void sh4__flush_invalidate_region(void *start, int size)
+{
+	reg_size_t aligned_start, v, cnt, end;
+
+	aligned_start = register_align(start);
+	v = aligned_start & ~(L1_CACHE_BYTES-1);
+	end = (aligned_start + size + L1_CACHE_BYTES-1)
+		& ~(L1_CACHE_BYTES-1);
+	cnt = (end - v) / L1_CACHE_BYTES;
+
+	while (cnt >= 8) {
+		__ocbi(v); v += L1_CACHE_BYTES;
+		__ocbi(v); v += L1_CACHE_BYTES;
+		__ocbi(v); v += L1_CACHE_BYTES;
+		__ocbi(v); v += L1_CACHE_BYTES;
+		__ocbi(v); v += L1_CACHE_BYTES;
+		__ocbi(v); v += L1_CACHE_BYTES;
+		__ocbi(v); v += L1_CACHE_BYTES;
+		__ocbi(v); v += L1_CACHE_BYTES;
+		cnt -= 8;
+	}
+
+	while (cnt) {
+		__ocbi(v); v += L1_CACHE_BYTES;
+		cnt--;
+	}
+}
+
+void __init sh4__flush_region_init(void)
+{
+	__flush_wback_region		= sh4__flush_wback_region;
+	__flush_invalidate_region	= sh4__flush_invalidate_region;
+	__flush_purge_region		= sh4__flush_purge_region;
+}
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index fe532aeaa16d..8173e38afd38 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -106,27 +106,31 @@ void __init page_table_range_init(unsigned long start, unsigned long end,
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	int pgd_idx;
+	pte_t *pte;
+	int i, j, k;
 	unsigned long vaddr;
 
-	vaddr = start & PMD_MASK;
-	end = (end + PMD_SIZE - 1) & PMD_MASK;
-	pgd_idx = pgd_index(vaddr);
-	pgd = pgd_base + pgd_idx;
-
-	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
-		BUG_ON(pgd_none(*pgd));
-		pud = pud_offset(pgd, 0);
-		BUG_ON(pud_none(*pud));
-		pmd = pmd_offset(pud, 0);
-
-		if (!pmd_present(*pmd)) {
-			pte_t *pte_table;
-			pte_table = (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
-			pmd_populate_kernel(&init_mm, pmd, pte_table);
+	vaddr = start;
+	i = __pgd_offset(vaddr);
+	j = __pud_offset(vaddr);
+	k = __pmd_offset(vaddr);
+	pgd = pgd_base + i;
+
+	for ( ; (i < PTRS_PER_PGD) && (vaddr != end); pgd++, i++) {
+		pud = (pud_t *)pgd;
+		for ( ; (j < PTRS_PER_PUD) && (vaddr != end); pud++, j++) {
+			pmd = (pmd_t *)pud;
+			for (; (k < PTRS_PER_PMD) && (vaddr != end); pmd++, k++) {
+				if (pmd_none(*pmd)) {
+					pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+					pmd_populate_kernel(&init_mm, pmd, pte);
+					BUG_ON(pte != pte_offset_kernel(pmd, 0));
+				}
+				vaddr += PMD_SIZE;
+			}
+			k = 0;
 		}
-
-		vaddr += PMD_SIZE;
+		j = 0;
 	}
 }
 #endif	/* CONFIG_MMU */
@@ -137,7 +141,7 @@ void __init page_table_range_init(unsigned long start, unsigned long end,
 void __init paging_init(void)
 {
 	unsigned long max_zone_pfns[MAX_NR_ZONES];
-	unsigned long vaddr;
+	unsigned long vaddr, end;
 	int nid;
 
 	/* We don't need to map the kernel through the TLB, as
@@ -155,7 +159,8 @@ void __init paging_init(void)
 	 * pte's will be filled in by __set_fixmap().
 	 */
 	vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
-	page_table_range_init(vaddr, 0, swapper_pg_dir);
+	end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
+	page_table_range_init(vaddr, end, swapper_pg_dir);
 
 	kmap_coherent_init();
 
@@ -181,8 +186,6 @@ void __init paging_init(void)
 	set_fixmap_nocache(FIX_UNCACHED, __pa(&__uncached_start));
 }
 
-static struct kcore_list kcore_mem, kcore_vmalloc;
-
 void __init mem_init(void)
 {
 	int codesize, datasize, initsize;
@@ -210,6 +213,9 @@ void __init mem_init(void)
 			high_memory = node_high_memory;
 	}
 
+	/* Set this up early, so we can take care of the zero page */
+	cpu_cache_init();
+
 	/* clear the zero-page */
 	memset(empty_zero_page, 0, PAGE_SIZE);
 	__flush_wback_region(empty_zero_page, PAGE_SIZE);
@@ -218,20 +224,14 @@ void __init mem_init(void)
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
 
-	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
-	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
-		   VMALLOC_END - VMALLOC_START);
-
 	printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
 	       "%dk data, %dk init)\n",
-		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+		nr_free_pages() << (PAGE_SHIFT-10),
 		num_physpages << (PAGE_SHIFT-10),
 		codesize >> 10,
 		datasize >> 10,
 		initsize >> 10);
 
-	p3_cache_init();
-
 	/* Initialize the vDSO */
 	vsyscall_init();
 }
diff --git a/arch/sh/mm/ioremap_32.c b/arch/sh/mm/ioremap_32.c
index da2f4186f2cd..c3250614e3ae 100644
--- a/arch/sh/mm/ioremap_32.c
+++ b/arch/sh/mm/ioremap_32.c
@@ -57,14 +57,6 @@ void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
 	if (is_pci_memory_fixed_range(phys_addr, size))
 		return (void __iomem *)phys_addr;
 
-#if !defined(CONFIG_PMB_FIXED)
-	/*
-	 * Don't allow anybody to remap normal RAM that we're using..
-	 */
-	if (phys_addr < virt_to_phys(high_memory))
-		return NULL;
-#endif
-
 	/*
 	 * Mappings have to be page-aligned
 	 */
diff --git a/arch/sh/mm/ioremap_64.c b/arch/sh/mm/ioremap_64.c
index 828c8597219d..b16843d02b76 100644
--- a/arch/sh/mm/ioremap_64.c
+++ b/arch/sh/mm/ioremap_64.c
@@ -94,7 +94,6 @@ static struct resource *shmedia_find_resource(struct resource *root,
 static void __iomem *shmedia_alloc_io(unsigned long phys, unsigned long size,
 				      const char *name, unsigned long flags)
 {
-	static int printed_full;
 	struct xresource *xres;
 	struct resource *res;
 	char *tack;
@@ -108,11 +107,8 @@ static void __iomem *shmedia_alloc_io(unsigned long phys, unsigned long size,
 		tack = xres->xname;
 		res = &xres->xres;
 	} else {
-		if (!printed_full) {
-			printk(KERN_NOTICE "%s: done with statics, "
+		printk_once(KERN_NOTICE "%s: done with statics, "
 			       "switching to kmalloc\n", __func__);
-			printed_full = 1;
-		}
 		tlen = strlen(name);
 		tack = kmalloc(sizeof(struct resource) + tlen + 1, GFP_KERNEL);
 		if (!tack)
diff --git a/arch/sh/mm/kmap.c b/arch/sh/mm/kmap.c
new file mode 100644
index 000000000000..16e01b5fed04
--- /dev/null
+++ b/arch/sh/mm/kmap.c
@@ -0,0 +1,65 @@
+/*
+ * arch/sh/mm/kmap.c
+ *
+ * Copyright (C) 1999, 2000, 2002  Niibe Yutaka
+ * Copyright (C) 2002 - 2009  Paul Mundt
+ *
+ * Released under the terms of the GNU GPL v2.0.
+ */
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <asm/mmu_context.h>
+#include <asm/cacheflush.h>
+
+#define kmap_get_fixmap_pte(vaddr)                                     \
+	pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)), (vaddr))
+
+static pte_t *kmap_coherent_pte;
+
+void __init kmap_coherent_init(void)
+{
+	unsigned long vaddr;
+
+	/* cache the first coherent kmap pte */
+	vaddr = __fix_to_virt(FIX_CMAP_BEGIN);
+	kmap_coherent_pte = kmap_get_fixmap_pte(vaddr);
+}
+
+void *kmap_coherent(struct page *page, unsigned long addr)
+{
+	enum fixed_addresses idx;
+	unsigned long vaddr;
+
+	BUG_ON(test_bit(PG_dcache_dirty, &page->flags));
+
+	pagefault_disable();
+
+	idx = FIX_CMAP_END -
+		((addr & current_cpu_data.dcache.alias_mask) >> PAGE_SHIFT);
+	vaddr = __fix_to_virt(idx);
+
+	BUG_ON(!pte_none(*(kmap_coherent_pte - idx)));
+	set_pte(kmap_coherent_pte - idx, mk_pte(page, PAGE_KERNEL));
+
+	return (void *)vaddr;
+}
+
+void kunmap_coherent(void *kvaddr)
+{
+	if (kvaddr >= (void *)FIXADDR_START) {
+		unsigned long vaddr = (unsigned long)kvaddr & PAGE_MASK;
+		enum fixed_addresses idx = __virt_to_fix(vaddr);
+
+		/* XXX.. Kill this later, here for sanity at the moment.. */
+		__flush_purge_region((void *)vaddr, PAGE_SIZE);
+
+		pte_clear(&init_mm, vaddr, kmap_coherent_pte - idx);
+		local_flush_tlb_one(get_asid(), vaddr);
+	}
+
+	pagefault_enable();
+}
diff --git a/arch/sh/mm/mmap.c b/arch/sh/mm/mmap.c
index 1b5fdfb4e0c2..d2984fa42d3d 100644
--- a/arch/sh/mm/mmap.c
+++ b/arch/sh/mm/mmap.c
@@ -14,10 +14,10 @@
 #include <asm/page.h>
 #include <asm/processor.h>
 
-#ifdef CONFIG_MMU
 unsigned long shm_align_mask = PAGE_SIZE - 1;	/* Sane caches */
 EXPORT_SYMBOL(shm_align_mask);
 
+#ifdef CONFIG_MMU
 /*
  * To avoid cache aliases, we map the shared page with same color.
  */
diff --git a/arch/sh/mm/tlb-nommu.c b/arch/sh/mm/nommu.c
index 71c742b5aee3..ac16c05917ef 100644
--- a/arch/sh/mm/tlb-nommu.c
+++ b/arch/sh/mm/nommu.c
@@ -1,20 +1,41 @@
 /*
- * arch/sh/mm/tlb-nommu.c
+ * arch/sh/mm/nommu.c
  *
- * TLB Operations for MMUless SH.
+ * Various helper routines and stubs for MMUless SH.
  *
- * Copyright (C) 2002 Paul Mundt
+ * Copyright (C) 2002 - 2009 Paul Mundt
  *
  * Released under the terms of the GNU GPL v2.0.
  */
 #include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/string.h>
 #include <linux/mm.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
+#include <asm/page.h>
+#include <asm/uaccess.h>
 
 /*
  * Nothing too terribly exciting here ..
  */
+void copy_page(void *to, void *from)
+{
+	memcpy(to, from, PAGE_SIZE);
+}
+
+__kernel_size_t __copy_user(void *to, const void *from, __kernel_size_t n)
+{
+	memcpy(to, from, n);
+	return 0;
+}
+
+__kernel_size_t __clear_user(void *to, __kernel_size_t n)
+{
+	memset(to, 0, n);
+	return 0;
+}
+
 void local_flush_tlb_all(void)
 {
 	BUG();
@@ -46,8 +67,21 @@ void local_flush_tlb_kernel_range(unsigned long start, unsigned long end)
 	BUG();
 }
 
-void update_mmu_cache(struct vm_area_struct * vma,
-		      unsigned long address, pte_t pte)
+void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte)
+{
+}
+
+void __init kmap_coherent_init(void)
+{
+}
+
+void *kmap_coherent(struct page *page, unsigned long addr)
+{
+	BUG();
+	return NULL;
+}
+
+void kunmap_coherent(void *kvaddr)
 {
 	BUG();
 }
diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c
index 095d93bec7cd..9b784fdb947c 100644
--- a/arch/sh/mm/numa.c
+++ b/arch/sh/mm/numa.c
@@ -9,6 +9,7 @@
  */
 #include <linux/module.h>
 #include <linux/bootmem.h>
+#include <linux/lmb.h>
 #include <linux/mm.h>
 #include <linux/numa.h>
 #include <linux/pfn.h>
@@ -26,6 +27,15 @@ EXPORT_SYMBOL_GPL(node_data);
 void __init setup_memory(void)
 {
 	unsigned long free_pfn = PFN_UP(__pa(_end));
+	u64 base = min_low_pfn << PAGE_SHIFT;
+	u64 size = (max_low_pfn << PAGE_SHIFT) - min_low_pfn;
+
+	lmb_add(base, size);
+
+	/* Reserve the LMB regions used by the kernel, initrd, etc.. */
+	lmb_reserve(__MEMORY_START + CONFIG_ZERO_PAGE_OFFSET,
+		    (PFN_PHYS(free_pfn) + PAGE_SIZE - 1) -
+		    (__MEMORY_START + CONFIG_ZERO_PAGE_OFFSET));
 
 	/*
 	 * Node 0 sets up its pgdat at the first available pfn,
@@ -45,24 +55,23 @@ void __init setup_memory(void)
 
 void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
 {
-	unsigned long bootmap_pages, bootmap_start, bootmap_size;
-	unsigned long start_pfn, free_pfn, end_pfn;
+	unsigned long bootmap_pages;
+	unsigned long start_pfn, end_pfn;
+	unsigned long bootmem_paddr;
 
 	/* Don't allow bogus node assignment */
 	BUG_ON(nid > MAX_NUMNODES || nid == 0);
 
-	/*
-	 * The free pfn starts at the beginning of the range, and is
-	 * advanced as necessary for pgdat and node map allocations.
-	 */
-	free_pfn = start_pfn = start >> PAGE_SHIFT;
+	start_pfn = start >> PAGE_SHIFT;
 	end_pfn = end >> PAGE_SHIFT;
 
+	lmb_add(start, end - start);
+
 	__add_active_range(nid, start_pfn, end_pfn);
 
 	/* Node-local pgdat */
-	NODE_DATA(nid) = pfn_to_kaddr(free_pfn);
-	free_pfn += PFN_UP(sizeof(struct pglist_data));
+	NODE_DATA(nid) = __va(lmb_alloc_base(sizeof(struct pglist_data),
+					     SMP_CACHE_BYTES, end_pfn));
 	memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 
 	NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
@@ -71,16 +80,17 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
 
 	/* Node-local bootmap */
 	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
-	bootmap_start = (unsigned long)pfn_to_kaddr(free_pfn);
-	bootmap_size = init_bootmem_node(NODE_DATA(nid), free_pfn, start_pfn,
-				    end_pfn);
+	bootmem_paddr = lmb_alloc_base(bootmap_pages << PAGE_SHIFT,
+				       PAGE_SIZE, end_pfn);
+	init_bootmem_node(NODE_DATA(nid), bootmem_paddr >> PAGE_SHIFT,
+			  start_pfn, end_pfn);
 
 	free_bootmem_with_active_regions(nid, end_pfn);
 
 	/* Reserve the pgdat and bootmap space with the bootmem allocator */
 	reserve_bootmem_node(NODE_DATA(nid), start_pfn << PAGE_SHIFT,
 			     sizeof(struct pglist_data), BOOTMEM_DEFAULT);
-	reserve_bootmem_node(NODE_DATA(nid), free_pfn << PAGE_SHIFT,
+	reserve_bootmem_node(NODE_DATA(nid), bootmem_paddr,
 			     bootmap_pages << PAGE_SHIFT, BOOTMEM_DEFAULT);
 
 	/* It's up */
diff --git a/arch/sh/mm/pg-nommu.c b/arch/sh/mm/pg-nommu.c
deleted file mode 100644
index 91ed4e695ff7..000000000000
--- a/arch/sh/mm/pg-nommu.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * arch/sh/mm/pg-nommu.c
- *
- * clear_page()/copy_page() implementation for MMUless SH.
- *
- * Copyright (C) 2003  Paul Mundt
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <asm/page.h>
-#include <asm/uaccess.h>
-
-void copy_page(void *to, void *from)
-{
-	memcpy(to, from, PAGE_SIZE);
-}
-
-void clear_page(void *to)
-{
-	memset(to, 0, PAGE_SIZE);
-}
-
-__kernel_size_t __copy_user(void *to, const void *from, __kernel_size_t n)
-{
-	memcpy(to, from, n);
-	return 0;
-}
-
-__kernel_size_t __clear_user(void *to, __kernel_size_t n)
-{
-	memset(to, 0, n);
-	return 0;
-}
diff --git a/arch/sh/mm/pg-sh4.c b/arch/sh/mm/pg-sh4.c
deleted file mode 100644
index 2fe14da1f839..000000000000
--- a/arch/sh/mm/pg-sh4.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * arch/sh/mm/pg-sh4.c
- *
- * Copyright (C) 1999, 2000, 2002  Niibe Yutaka
- * Copyright (C) 2002 - 2007  Paul Mundt
- *
- * Released under the terms of the GNU GPL v2.0.
- */
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/mutex.h>
-#include <linux/fs.h>
-#include <linux/highmem.h>
-#include <linux/module.h>
-#include <asm/mmu_context.h>
-#include <asm/cacheflush.h>
-
-#define CACHE_ALIAS (current_cpu_data.dcache.alias_mask)
-
-#define kmap_get_fixmap_pte(vaddr)                                     \
-	pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)), (vaddr))
-
-static pte_t *kmap_coherent_pte;
-
-void __init kmap_coherent_init(void)
-{
-	unsigned long vaddr;
-
-	/* cache the first coherent kmap pte */
-	vaddr = __fix_to_virt(FIX_CMAP_BEGIN);
-	kmap_coherent_pte = kmap_get_fixmap_pte(vaddr);
-}
-
-static inline void *kmap_coherent(struct page *page, unsigned long addr)
-{
-	enum fixed_addresses idx;
-	unsigned long vaddr, flags;
-	pte_t pte;
-
-	inc_preempt_count();
-
-	idx = (addr & current_cpu_data.dcache.alias_mask) >> PAGE_SHIFT;
-	vaddr = __fix_to_virt(FIX_CMAP_END - idx);
-	pte = mk_pte(page, PAGE_KERNEL);
-
-	local_irq_save(flags);
-	flush_tlb_one(get_asid(), vaddr);
-	local_irq_restore(flags);
-
-	update_mmu_cache(NULL, vaddr, pte);
-
-	set_pte(kmap_coherent_pte - (FIX_CMAP_END - idx), pte);
-
-	return (void *)vaddr;
-}
-
-static inline void kunmap_coherent(struct page *page)
-{
-	dec_preempt_count();
-	preempt_check_resched();
-}
-
-/*
- * clear_user_page
- * @to: P1 address
- * @address: U0 address to be mapped
- * @page: page (virt_to_page(to))
- */
-void clear_user_page(void *to, unsigned long address, struct page *page)
-{
-	__set_bit(PG_mapped, &page->flags);
-
-	clear_page(to);
-	if ((((address & PAGE_MASK) ^ (unsigned long)to) & CACHE_ALIAS))
-		__flush_wback_region(to, PAGE_SIZE);
-}
-
-void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
-		       unsigned long vaddr, void *dst, const void *src,
-		       unsigned long len)
-{
-	void *vto;
-
-	__set_bit(PG_mapped, &page->flags);
-
-	vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
-	memcpy(vto, src, len);
-	kunmap_coherent(vto);
-
-	if (vma->vm_flags & VM_EXEC)
-		flush_cache_page(vma, vaddr, page_to_pfn(page));
-}
-
-void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
-			 unsigned long vaddr, void *dst, const void *src,
-			 unsigned long len)
-{
-	void *vfrom;
-
-	__set_bit(PG_mapped, &page->flags);
-
-	vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
-	memcpy(dst, vfrom, len);
-	kunmap_coherent(vfrom);
-}
-
-void copy_user_highpage(struct page *to, struct page *from,
-			unsigned long vaddr, struct vm_area_struct *vma)
-{
-	void *vfrom, *vto;
-
-	__set_bit(PG_mapped, &to->flags);
-
-	vto = kmap_atomic(to, KM_USER1);
-	vfrom = kmap_coherent(from, vaddr);
-	copy_page(vto, vfrom);
-	kunmap_coherent(vfrom);
-
-	if (((vaddr ^ (unsigned long)vto) & CACHE_ALIAS))
-		__flush_wback_region(vto, PAGE_SIZE);
-
-	kunmap_atomic(vto, KM_USER1);
-	/* Make sure this page is cleared on other CPU's too before using it */
-	smp_wmb();
-}
-EXPORT_SYMBOL(copy_user_highpage);
-
-/*
- * For SH-4, we have our own implementation for ptep_get_and_clear
- */
-pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-
-	pte_clear(mm, addr, ptep);
-	if (!pte_not_present(pte)) {
-		unsigned long pfn = pte_pfn(pte);
-		if (pfn_valid(pfn)) {
-			struct page *page = pfn_to_page(pfn);
-			struct address_space *mapping = page_mapping(page);
-			if (!mapping || !mapping_writably_mapped(mapping))
-				__clear_bit(PG_mapped, &page->flags);
-		}
-	}
-	return pte;
-}
diff --git a/arch/sh/mm/pg-sh7705.c b/arch/sh/mm/pg-sh7705.c
deleted file mode 100644
index eaf25147194c..000000000000
--- a/arch/sh/mm/pg-sh7705.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * arch/sh/mm/pg-sh7705.c
- *
- * Copyright (C) 1999, 2000  Niibe Yutaka
- * Copyright (C) 2004  Alex Song
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- */
-
-#include <linux/init.h>
-#include <linux/mman.h>
-#include <linux/mm.h>
-#include <linux/threads.h>
-#include <linux/fs.h>
-#include <asm/addrspace.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/cache.h>
-#include <asm/io.h>
-#include <asm/uaccess.h>
-#include <asm/pgalloc.h>
-#include <asm/mmu_context.h>
-#include <asm/cacheflush.h>
-
-static inline void __flush_purge_virtual_region(void *p1, void *virt, int size)
-{
-	unsigned long v;
-	unsigned long begin, end;
-	unsigned long p1_begin;
-
-
-	begin = L1_CACHE_ALIGN((unsigned long)virt);
-	end = L1_CACHE_ALIGN((unsigned long)virt + size);
-
-	p1_begin = (unsigned long)p1 & ~(L1_CACHE_BYTES - 1);
-
-	/* do this the slow way as we may not have TLB entries
-	 * for virt yet. */
-	for (v = begin; v < end; v += L1_CACHE_BYTES) {
-		unsigned long p;
-	        unsigned long ways, addr;
-
-		p = __pa(p1_begin);
-
-	        ways = current_cpu_data.dcache.ways;
-		addr = CACHE_OC_ADDRESS_ARRAY;
-
-		do {
-			unsigned long data;
-
-			addr |= (v & current_cpu_data.dcache.entry_mask);
-
-			data = ctrl_inl(addr);
-			if ((data & CACHE_PHYSADDR_MASK) ==
-			       (p & CACHE_PHYSADDR_MASK)) {
-				data &= ~(SH_CACHE_UPDATED|SH_CACHE_VALID);
-				ctrl_outl(data, addr);
-			}
-
-			addr += current_cpu_data.dcache.way_incr;
-		} while (--ways);
-
-		p1_begin += L1_CACHE_BYTES;
-	}
-}
-
-/*
- * clear_user_page
- * @to: P1 address
- * @address: U0 address to be mapped
- */
-void clear_user_page(void *to, unsigned long address, struct page *pg)
-{
-	struct page *page = virt_to_page(to);
-
-	__set_bit(PG_mapped, &page->flags);
-	if (((address ^ (unsigned long)to) & CACHE_ALIAS) == 0) {
-		clear_page(to);
-		__flush_wback_region(to, PAGE_SIZE);
-	} else {
-		__flush_purge_virtual_region(to,
-					     (void *)(address & 0xfffff000),
-					     PAGE_SIZE);
-		clear_page(to);
-		__flush_wback_region(to, PAGE_SIZE);
-	}
-}
-
-/*
- * copy_user_page
- * @to: P1 address
- * @from: P1 address
- * @address: U0 address to be mapped
- */
-void copy_user_page(void *to, void *from, unsigned long address, struct page *pg)
-{
-	struct page *page = virt_to_page(to);
-
-
-	__set_bit(PG_mapped, &page->flags);
-	if (((address ^ (unsigned long)to) & CACHE_ALIAS) == 0) {
-		copy_page(to, from);
-		__flush_wback_region(to, PAGE_SIZE);
-	} else {
-		__flush_purge_virtual_region(to,
-					     (void *)(address & 0xfffff000),
-					     PAGE_SIZE);
-		copy_page(to, from);
-		__flush_wback_region(to, PAGE_SIZE);
-	}
-}
-
-/*
- * For SH7705, we have our own implementation for ptep_get_and_clear
- * Copied from pg-sh4.c
- */
-pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-
-	pte_clear(mm, addr, ptep);
-	if (!pte_not_present(pte)) {
-		unsigned long pfn = pte_pfn(pte);
-		if (pfn_valid(pfn)) {
-			struct page *page = pfn_to_page(pfn);
-			struct address_space *mapping = page_mapping(page);
-			if (!mapping || !mapping_writably_mapped(mapping))
-				__clear_bit(PG_mapped, &page->flags);
-		}
-	}
-
-	return pte;
-}
-
diff --git a/arch/sh/mm/tlb-pteaex.c b/arch/sh/mm/tlb-pteaex.c
index 2aab3ea934d7..409b7c2b4b9d 100644
--- a/arch/sh/mm/tlb-pteaex.c
+++ b/arch/sh/mm/tlb-pteaex.c
@@ -16,34 +16,16 @@
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
 
-void update_mmu_cache(struct vm_area_struct * vma,
-		      unsigned long address, pte_t pte)
+void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte)
 {
-	unsigned long flags;
-	unsigned long pteval;
-	unsigned long vpn;
+	unsigned long flags, pteval, vpn;
 
-	/* Ptrace may call this routine. */
+	/*
+	 * Handle debugger faulting in for debugee.
+	 */
 	if (vma && current->active_mm != vma->vm_mm)
 		return;
 
-#ifndef CONFIG_CACHE_OFF
-	{
-		unsigned long pfn = pte_pfn(pte);
-
-		if (pfn_valid(pfn)) {
-			struct page *page = pfn_to_page(pfn);
-
-			if (!test_bit(PG_mapped, &page->flags)) {
-				unsigned long phys = pte_val(pte) & PTE_PHYS_MASK;
-				__flush_wback_region((void *)P1SEGADDR(phys),
-						     PAGE_SIZE);
-				__set_bit(PG_mapped, &page->flags);
-			}
-		}
-	}
-#endif
-
 	local_irq_save(flags);
 
 	/* Set PTEH register */
diff --git a/arch/sh/mm/tlb-sh3.c b/arch/sh/mm/tlb-sh3.c
index 17cb7c3adf22..ace8e6d2f59d 100644
--- a/arch/sh/mm/tlb-sh3.c
+++ b/arch/sh/mm/tlb-sh3.c
@@ -27,32 +27,16 @@
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
 
-void update_mmu_cache(struct vm_area_struct * vma,
-		      unsigned long address, pte_t pte)
+void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte)
 {
-	unsigned long flags;
-	unsigned long pteval;
-	unsigned long vpn;
+	unsigned long flags, pteval, vpn;
 
-	/* Ptrace may call this routine. */
+	/*
+	 * Handle debugger faulting in for debugee.
+	 */
 	if (vma && current->active_mm != vma->vm_mm)
 		return;
 
-#if defined(CONFIG_SH7705_CACHE_32KB)
-	{
-		struct page *page = pte_page(pte);
-		unsigned long pfn = pte_pfn(pte);
-
-		if (pfn_valid(pfn) && !test_bit(PG_mapped, &page->flags)) {
-			unsigned long phys = pte_val(pte) & PTE_PHYS_MASK;
-
-			__flush_wback_region((void *)P1SEGADDR(phys),
-					     PAGE_SIZE);
-			__set_bit(PG_mapped, &page->flags);
-		}
-	}
-#endif
-
 	local_irq_save(flags);
 
 	/* Set PTEH register */
@@ -93,4 +77,3 @@ void local_flush_tlb_one(unsigned long asid, unsigned long page)
 	for (i = 0; i < ways; i++)
 		ctrl_outl(data, addr + (i << 8));
 }
-
diff --git a/arch/sh/mm/tlb-sh4.c b/arch/sh/mm/tlb-sh4.c
index f0c7b7397fa6..8cf550e2570f 100644
--- a/arch/sh/mm/tlb-sh4.c
+++ b/arch/sh/mm/tlb-sh4.c
@@ -15,34 +15,16 @@
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
 
-void update_mmu_cache(struct vm_area_struct * vma,
-		      unsigned long address, pte_t pte)
+void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte)
 {
-	unsigned long flags;
-	unsigned long pteval;
-	unsigned long vpn;
+	unsigned long flags, pteval, vpn;
 
-	/* Ptrace may call this routine. */
+	/*
+	 * Handle debugger faulting in for debugee.
+	 */
 	if (vma && current->active_mm != vma->vm_mm)
 		return;
 
-#ifndef CONFIG_CACHE_OFF
-	{
-		unsigned long pfn = pte_pfn(pte);
-
-		if (pfn_valid(pfn)) {
-			struct page *page = pfn_to_page(pfn);
-
-			if (!test_bit(PG_mapped, &page->flags)) {
-				unsigned long phys = pte_val(pte) & PTE_PHYS_MASK;
-				__flush_wback_region((void *)P1SEGADDR(phys),
-						     PAGE_SIZE);
-				__set_bit(PG_mapped, &page->flags);
-			}
-		}
-	}
-#endif
-
 	local_irq_save(flags);
 
 	/* Set PTEH register */
@@ -61,9 +43,12 @@ void update_mmu_cache(struct vm_area_struct * vma,
 	 */
 	ctrl_outl(pte.pte_high, MMU_PTEA);
 #else
-	if (cpu_data->flags & CPU_HAS_PTEA)
-		/* TODO: make this look less hacky */
-		ctrl_outl(((pteval >> 28) & 0xe) | (pteval & 0x1), MMU_PTEA);
+	if (cpu_data->flags & CPU_HAS_PTEA) {
+		/* The last 3 bits and the first one of pteval contains
+		 * the PTEA timing control and space attribute bits
+		 */
+		ctrl_outl(copy_ptea_attributes(pteval), MMU_PTEA);
+	}
 #endif
 
 	/* Set PTEL register */
diff --git a/arch/sh/mm/tlb-sh5.c b/arch/sh/mm/tlb-sh5.c
index dae131243bcc..fdb64e41ec50 100644
--- a/arch/sh/mm/tlb-sh5.c
+++ b/arch/sh/mm/tlb-sh5.c
@@ -117,26 +117,15 @@ int sh64_put_wired_dtlb_entry(unsigned long long entry)
  * Load up a virtual<->physical translation for @eaddr<->@paddr in the
  * pre-allocated TLB slot @config_addr (see sh64_get_wired_dtlb_entry).
  */
-inline void sh64_setup_tlb_slot(unsigned long long config_addr,
-				unsigned long eaddr,
-				unsigned long asid,
-				unsigned long paddr)
+void sh64_setup_tlb_slot(unsigned long long config_addr, unsigned long eaddr,
+			 unsigned long asid, unsigned long paddr)
 {
 	unsigned long long pteh, ptel;
 
-	/* Sign extension */
-#if (NEFF == 32)
-	pteh = (unsigned long long)(signed long long)(signed long) eaddr;
-#else
-#error "Can't sign extend more than 32 bits yet"
-#endif
+	pteh = neff_sign_extend(eaddr);
 	pteh &= PAGE_MASK;
 	pteh |= (asid << PTEH_ASID_SHIFT) | PTEH_VALID;
-#if (NEFF == 32)
-	ptel = (unsigned long long)(signed long long)(signed long) paddr;
-#else
-#error "Can't sign extend more than 32 bits yet"
-#endif
+	ptel = neff_sign_extend(paddr);
 	ptel &= PAGE_MASK;
 	ptel |= (_PAGE_CACHABLE | _PAGE_READ | _PAGE_WRITE);
 
@@ -152,5 +141,5 @@ inline void sh64_setup_tlb_slot(unsigned long long config_addr,
  *
  * Teardown any existing mapping in the TLB slot @config_addr.
  */
-inline void sh64_teardown_tlb_slot(unsigned long long config_addr)
+void sh64_teardown_tlb_slot(unsigned long long config_addr)
 	__attribute__ ((alias("__flush_tlb_slot")));
diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c
index 3ce40ea34824..de0b0e881823 100644
--- a/arch/sh/mm/tlbflush_64.c
+++ b/arch/sh/mm/tlbflush_64.c
@@ -20,7 +20,7 @@
 #include <linux/mman.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/interrupt.h>
 #include <asm/system.h>
 #include <asm/io.h>
@@ -116,7 +116,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
 	/* Not an IO address, so reenable interrupts */
 	local_irq_enable();
 
-	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 
 	/*
 	 * If we're in an interrupt or have no user
@@ -201,11 +201,11 @@ survive:
 
 	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 				     regs, address);
 	} else {
 		tsk->min_flt++;
-		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 				     regs, address);
 	}
 
@@ -329,22 +329,6 @@ do_sigbus:
 		goto no_context;
 }
 
-void update_mmu_cache(struct vm_area_struct * vma,
-			unsigned long address, pte_t pte)
-{
-	/*
-	 * This appears to get called once for every pte entry that gets
-	 * established => I don't think it's efficient to try refilling the
-	 * TLBs with the pages - some may not get accessed even.  Also, for
-	 * executable pages, it is impossible to determine reliably here which
-	 * TLB they should be mapped into (or both even).
-	 *
-	 * So, just do nothing here and handle faults on demand.  In the
-	 * TLBMISS handling case, the refill is now done anyway after the pte
-	 * has been fixed up, so that deals with most useful cases.
-	 */
-}
-
 void local_flush_tlb_one(unsigned long asid, unsigned long page)
 {
 	unsigned long long match, pteh=0, lpage;
@@ -353,7 +337,7 @@ void local_flush_tlb_one(unsigned long asid, unsigned long page)
 	/*
 	 * Sign-extend based on neff.
 	 */
-	lpage = (page & NEFF_SIGN) ? (page | NEFF_MASK) : page;
+	lpage = neff_sign_extend(page);
 	match = (asid << PTEH_ASID_SHIFT) | PTEH_VALID;
 	match |= lpage;
 
@@ -482,3 +466,7 @@ void local_flush_tlb_kernel_range(unsigned long start, unsigned long end)
         /* FIXME: Optimize this later.. */
         flush_tlb_all();
 }
+
+void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte)
+{
+}