-rw-r--r-- | arch/arm/mach-ixp23xx/core.c          | 18
-rw-r--r-- | arch/sparc64/kernel/head.S            | 30
-rw-r--r-- | arch/sparc64/kernel/setup.c           | 23
-rw-r--r-- | arch/sparc64/kernel/smp.c             | 16
-rw-r--r-- | block/cfq-iosched.c                   | 52
-rw-r--r-- | include/asm-arm/arch-ixp23xx/memory.h |  2
-rw-r--r-- | include/asm-generic/pgtable.h         | 11
-rw-r--r-- | include/asm-mips/pgtable.h            | 10
-rw-r--r-- | include/asm-sparc64/pgtable.h         | 17
-rw-r--r-- | mm/slab.c                             | 27
10 files changed, 141 insertions, 65 deletions
diff --git a/arch/arm/mach-ixp23xx/core.c b/arch/arm/mach-ixp23xx/core.c
index 092ee12ced42..affd1d5d7440 100644
--- a/arch/arm/mach-ixp23xx/core.c
+++ b/arch/arm/mach-ixp23xx/core.c
@@ -178,8 +178,12 @@ static int ixp23xx_irq_set_type(unsigned int irq, unsigned int type)
 
 static void ixp23xx_irq_mask(unsigned int irq)
 {
-	volatile unsigned long *intr_reg = IXP23XX_INTR_EN1 + (irq / 32);
+	volatile unsigned long *intr_reg;
 
+	if (irq >= 56)
+		irq += 8;
+
+	intr_reg = IXP23XX_INTR_EN1 + (irq / 32);
 	*intr_reg &= ~(1 << (irq % 32));
 }
 
@@ -199,17 +203,25 @@ static void ixp23xx_irq_ack(unsigned int irq)
  */
 static void ixp23xx_irq_level_unmask(unsigned int irq)
 {
-	volatile unsigned long *intr_reg = IXP23XX_INTR_EN1 + (irq / 32);
+	volatile unsigned long *intr_reg;
 
 	ixp23xx_irq_ack(irq);
 
+	if (irq >= 56)
+		irq += 8;
+
+	intr_reg = IXP23XX_INTR_EN1 + (irq / 32);
 	*intr_reg |= (1 << (irq % 32));
 }
 
 static void ixp23xx_irq_edge_unmask(unsigned int irq)
 {
-	volatile unsigned long *intr_reg = IXP23XX_INTR_EN1 + (irq / 32);
+	volatile unsigned long *intr_reg;
+
+	if (irq >= 56)
+		irq += 8;
+
+	intr_reg = IXP23XX_INTR_EN1 + (irq / 32);
 	*intr_reg |= (1 << (irq % 32));
 }
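The ixp23xx change accounts for a gap in the interrupt enable register layout: hardware sources from IRQ 56 upward sit 8 bit positions higher than their Linux IRQ numbers, so indexing IXP23XX_INTR_EN1 directly with irq/32 hit the wrong register and bit for them. A minimal stand-alone sketch of the index math (the 56/+8 remap and the irq/32, irq%32 decomposition come from the patch; the register array and test harness below are hypothetical stand-ins for the memory-mapped enable registers):

#include <assert.h>

/* Hypothetical stand-in for the enable registers; in the kernel,
 * IXP23XX_INTR_EN1 points at the real EN1 register and the others
 * follow it contiguously. */
static unsigned long intr_en[4];

static void irq_mask_sketch(unsigned int irq)
{
	/* Hardware IRQ sources 56 and up live 8 bit positions above
	 * their Linux IRQ numbers, as in the patched helpers. */
	if (irq >= 56)
		irq += 8;

	intr_en[irq / 32] &= ~(1UL << (irq % 32));
}

int main(void)
{
	intr_en[2] = ~0UL;
	irq_mask_sketch(56);	/* 56 + 8 = 64 -> register 2, bit 0 */
	assert(!(intr_en[2] & 1UL));
	return 0;
}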
diff --git a/arch/sparc64/kernel/head.S b/arch/sparc64/kernel/head.S
index 3eadac5e171e..31c5892f5acc 100644
--- a/arch/sparc64/kernel/head.S
+++ b/arch/sparc64/kernel/head.S
@@ -10,6 +10,7 @@
 #include <linux/config.h>
 #include <linux/version.h>
 #include <linux/errno.h>
+#include <linux/threads.h>
 #include <asm/thread_info.h>
 #include <asm/asi.h>
 #include <asm/pstate.h>
@@ -493,6 +494,35 @@ tlb_fixup_done:
 	call	prom_init
 	 mov	%l7, %o0		! OpenPROM cif handler
 
+	/* Initialize current_thread_info()->cpu as early as possible.
+	 * In order to do that accurately we have to patch up the get_cpuid()
+	 * assembler sequences.  And that, in turn, requires that we know
+	 * if we are on a Starfire box or not.  While we're here, patch up
+	 * the sun4v sequences as well.
+	 */
+	call	check_if_starfire
+	 nop
+	call	per_cpu_patch
+	 nop
+	call	sun4v_patch
+	 nop
+
+#ifdef CONFIG_SMP
+	call	hard_smp_processor_id
+	 nop
+	cmp	%o0, NR_CPUS
+	blu,pt	%xcc, 1f
+	 nop
+	call	boot_cpu_id_too_large
+	 nop
+	/* Not reached... */
+
+1:
+#else
+	mov	0, %o0
+#endif
+	stb	%o0, [%g6 + TI_CPU]
+
 	/* Off we go.... */
 	call	start_kernel
 	 nop
diff --git a/arch/sparc64/kernel/setup.c b/arch/sparc64/kernel/setup.c
index 005167f82419..9cf1c88cd774 100644
--- a/arch/sparc64/kernel/setup.c
+++ b/arch/sparc64/kernel/setup.c
@@ -220,7 +220,7 @@ char reboot_command[COMMAND_LINE_SIZE];
 
 static struct pt_regs fake_swapper_regs = { { 0, }, 0, 0, 0, 0 };
 
-static void __init per_cpu_patch(void)
+void __init per_cpu_patch(void)
 {
 	struct cpuid_patch_entry *p;
 	unsigned long ver;
@@ -280,7 +280,7 @@ static void __init per_cpu_patch(void)
 	}
 }
 
-static void __init sun4v_patch(void)
+void __init sun4v_patch(void)
 {
 	struct sun4v_1insn_patch_entry *p1;
 	struct sun4v_2insn_patch_entry *p2;
@@ -315,6 +315,15 @@ static void __init sun4v_patch(void)
 	}
 }
 
+#ifdef CONFIG_SMP
+void __init boot_cpu_id_too_large(int cpu)
+{
+	prom_printf("Serious problem, boot cpu id (%d) >= NR_CPUS (%d)\n",
+		    cpu, NR_CPUS);
+	prom_halt();
+}
+#endif
+
 void __init setup_arch(char **cmdline_p)
 {
 	/* Initialize PROM console and command line. */
@@ -332,16 +341,6 @@ void __init setup_arch(char **cmdline_p)
 	conswitchp = &prom_con;
 #endif
 
-	/* Work out if we are starfire early on */
-	check_if_starfire();
-
-	/* Now we know enough to patch the get_cpuid sequences
-	 * used by trap code.
-	 */
-	per_cpu_patch();
-
-	sun4v_patch();
-
 	boot_flags_init(*cmdline_p);
 
 	idprom_init();
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 90eaca3ec9a6..4e8cd79156e0 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -1264,7 +1264,6 @@ void __init smp_tick_init(void)
 	boot_cpu_id = hard_smp_processor_id();
 	current_tick_offset = timer_tick_offset;
 
-	cpu_set(boot_cpu_id, cpu_online_map);
 	prof_counter(boot_cpu_id) = prof_multiplier(boot_cpu_id) = 1;
 }
 
@@ -1345,18 +1344,6 @@ void __init smp_setup_cpu_possible_map(void)
 
 void __devinit smp_prepare_boot_cpu(void)
 {
-	int cpu = hard_smp_processor_id();
-
-	if (cpu >= NR_CPUS) {
-		prom_printf("Serious problem, boot cpu id >= NR_CPUS\n");
-		prom_halt();
-	}
-
-	current_thread_info()->cpu = cpu;
-	__local_per_cpu_offset = __per_cpu_offset(cpu);
-
-	cpu_set(smp_processor_id(), cpu_online_map);
-	cpu_set(smp_processor_id(), phys_cpu_present_map);
 }
 
 int __devinit __cpu_up(unsigned int cpu)
@@ -1433,4 +1420,7 @@ void __init setup_per_cpu_areas(void)
 
 	for (i = 0; i < NR_CPUS; i++, ptr += size)
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+
+	/* Setup %g5 for the boot cpu.  */
+	__local_per_cpu_offset = __per_cpu_offset(smp_processor_id());
 }
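The point of the sparc64 changes is ordering: smp_processor_id() reads current_thread_info()->cpu, so that field must be valid before the first C code runs, not merely by the time smp_prepare_boot_cpu() is called. Hence the NR_CPUS sanity check and the TI_CPU store move into head.S, right after the get_cpuid()/sun4v instruction patching they depend on. A hedged user-space model of the check head.S now performs (the NR_CPUS value and the hard_smp_processor_id() stand-in are placeholders; the real id comes from processor registers and firmware):

#include <stdio.h>

#define NR_CPUS 64	/* placeholder; config-dependent in the kernel */

/* Placeholder: the kernel reads the physical cpu id from hardware. */
static int hard_smp_processor_id_model(void) { return 0; }

int main(void)
{
	int cpu = hard_smp_processor_id_model();

	if (cpu >= NR_CPUS) {
		/* corresponds to boot_cpu_id_too_large() + prom_halt() */
		fprintf(stderr,
			"Serious problem, boot cpu id (%d) >= NR_CPUS (%d)\n",
			cpu, NR_CPUS);
		return 1;
	}
	/* corresponds to "stb %o0, [%g6 + TI_CPU]" in head.S */
	printf("boot cpu id %d stored in thread_info->cpu\n", cpu);
	return 0;
}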
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 11ce6aaf1bd0..8e9d84825e1c 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -133,6 +133,7 @@ struct cfq_data {
 	mempool_t *crq_pool;
 
 	int rq_in_driver;
+	int hw_tag;
 
 	/*
 	 * schedule slice state info
@@ -500,10 +501,13 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 
 	/*
 	 * if queue was preempted, just add to front to be fair. busy_rr
-	 * isn't sorted.
+	 * isn't sorted, but insert at the back for fairness.
 	 */
 	if (preempted || list == &cfqd->busy_rr) {
-		list_add(&cfqq->cfq_list, list);
+		if (preempted)
+			list = list->prev;
+
+		list_add_tail(&cfqq->cfq_list, list);
 		return;
 	}
 
@@ -664,6 +668,15 @@ static void cfq_activate_request(request_queue_t *q, struct request *rq)
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 
 	cfqd->rq_in_driver++;
+
+	/*
+	 * If the depth is larger 1, it really could be queueing. But lets
+	 * make the mark a little higher - idling could still be good for
+	 * low queueing, and a low queueing number could also just indicate
+	 * a SCSI mid layer like behaviour where limit+1 is often seen.
+	 */
+	if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
+		cfqd->hw_tag = 1;
 }
 
 static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
@@ -879,6 +892,13 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
 		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
 
 	/*
+	 * If no new queues are available, check if the busy list has some
+	 * before falling back to idle io.
+	 */
+	if (!cfqq && !list_empty(&cfqd->busy_rr))
+		cfqq = list_entry_cfqq(cfqd->busy_rr.next);
+
+	/*
 	 * if we have idle queues and no rt or be queues had pending
 	 * requests, either allow immediate service if the grace period
 	 * has passed or arm the idle grace timer
@@ -1458,7 +1478,8 @@ retry:
 		 * set ->slice_left to allow preemption for a new process
 		 */
 		cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
-		cfq_mark_cfqq_idle_window(cfqq);
+		if (!cfqd->hw_tag)
+			cfq_mark_cfqq_idle_window(cfqq);
 		cfq_mark_cfqq_prio_changed(cfqq);
 		cfq_init_prio_data(cfqq);
 	}
@@ -1649,7 +1670,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 {
 	int enable_idle = cfq_cfqq_idle_window(cfqq);
 
-	if (!cic->ioc->task || !cfqd->cfq_slice_idle)
+	if (!cic->ioc->task || !cfqd->cfq_slice_idle || cfqd->hw_tag)
 		enable_idle = 0;
 	else if (sample_valid(cic->ttime_samples)) {
 		if (cic->ttime_mean > cfqd->cfq_slice_idle)
@@ -1740,14 +1761,24 @@ cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 
 	cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
 
+	cic = crq->io_context;
+
 	/*
 	 * we never wait for an async request and we don't allow preemption
 	 * of an async request. so just return early
 	 */
-	if (!cfq_crq_is_sync(crq))
+	if (!cfq_crq_is_sync(crq)) {
+		/*
+		 * sync process issued an async request, if it's waiting
+		 * then expire it and kick rq handling.
+		 */
+		if (cic == cfqd->active_cic &&
+		    del_timer(&cfqd->idle_slice_timer)) {
+			cfq_slice_expired(cfqd, 0);
+			cfq_start_queueing(cfqd, cfqq);
+		}
 		return;
-
-	cic = crq->io_context;
+	}
 
 	cfq_update_io_thinktime(cfqd, cic);
 	cfq_update_io_seektime(cfqd, cic, crq);
@@ -2165,10 +2196,9 @@ static void cfq_idle_class_timer(unsigned long data)
 	 * race with a non-idle queue, reset timer
 	 */
 	end = cfqd->last_end_request + CFQ_IDLE_GRACE;
-	if (!time_after_eq(jiffies, end)) {
-		cfqd->idle_class_timer.expires = end;
-		add_timer(&cfqd->idle_class_timer);
-	} else
+	if (!time_after_eq(jiffies, end))
+		mod_timer(&cfqd->idle_class_timer, end);
+	else
 		cfq_schedule_dispatch(cfqd);
 
 	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
diff --git a/include/asm-arm/arch-ixp23xx/memory.h b/include/asm-arm/arch-ixp23xx/memory.h
index 6e19f46d54d1..c85fc06a043c 100644
--- a/include/asm-arm/arch-ixp23xx/memory.h
+++ b/include/asm-arm/arch-ixp23xx/memory.h
@@ -49,7 +49,7 @@ static inline int __ixp23xx_arch_is_coherent(void)
 {
 	extern unsigned int processor_id;
 
-	if (((processor_id & 15) >= 2) || machine_is_roadrunner())
+	if (((processor_id & 15) >= 4) || machine_is_roadrunner())
 		return 1;
 
 	return 0;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 358e4d309ceb..c2059a3a0621 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -159,17 +159,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
 #define lazy_mmu_prot_update(pte)	do { } while (0)
 #endif
 
-#ifndef __HAVE_ARCH_MULTIPLE_ZERO_PAGE
+#ifndef __HAVE_ARCH_MOVE_PTE
 #define move_pte(pte, prot, old_addr, new_addr)	(pte)
-#else
-#define move_pte(pte, prot, old_addr, new_addr)			\
-({								\
-	pte_t newpte = (pte);					\
-	if (pte_present(pte) && pfn_valid(pte_pfn(pte)) &&	\
-			pte_page(pte) == ZERO_PAGE(old_addr))	\
-		newpte = mk_pte(ZERO_PAGE(new_addr), (prot));	\
-	newpte;							\
-})
 #endif
 
 /*
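The asm-generic change flips the opt-in mechanism around: instead of a zero-page-aware move_pte() living in the generic header behind __HAVE_ARCH_MULTIPLE_ZERO_PAGE, architectures now define __HAVE_ARCH_MOVE_PTE and supply their own macro (MIPS below, sparc64 after it), and the generic default reduces to a pass-through. The hook is invoked from the mremap path whenever a PTE's virtual address changes; a simplified sketch of the call site (paraphrased from the mm/mremap.c PTE-move loop of this kernel generation, not a verbatim quote):

	pte = ptep_clear_flush(vma, old_addr, old_pte);
	/* arch hook: the generic version just returns pte; MIPS rewrites
	 * zero-page PTEs for the new address, sparc64 flushes D-cache
	 * pages that would alias at the new virtual address. */
	pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
	set_pte_at(mm, new_addr, new_pte, pte);

This keeps each architecture's address-dependent PTE fixup in its own header without the generic code having to know about either case.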
diff --git a/include/asm-mips/pgtable.h b/include/asm-mips/pgtable.h
index 174a3cda8c26..f80fe75c7800 100644
--- a/include/asm-mips/pgtable.h
+++ b/include/asm-mips/pgtable.h
@@ -70,7 +70,15 @@ extern unsigned long zero_page_mask;
 
 #define ZERO_PAGE(vaddr) \
 	(virt_to_page(empty_zero_page + (((unsigned long)(vaddr)) & zero_page_mask)))
 
-#define __HAVE_ARCH_MULTIPLE_ZERO_PAGE
+#define __HAVE_ARCH_MOVE_PTE
+#define move_pte(pte, prot, old_addr, new_addr)			\
+({								\
+	pte_t newpte = (pte);					\
+	if (pte_present(pte) && pfn_valid(pte_pfn(pte)) &&	\
+			pte_page(pte) == ZERO_PAGE(old_addr))	\
+		newpte = mk_pte(ZERO_PAGE(new_addr), (prot));	\
+	newpte;							\
+})
 
 extern void paging_init(void);
diff --git a/include/asm-sparc64/pgtable.h b/include/asm-sparc64/pgtable.h
index c44e7466534e..cd464f469a2c 100644
--- a/include/asm-sparc64/pgtable.h
+++ b/include/asm-sparc64/pgtable.h
@@ -689,6 +689,23 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *p
 #define pte_clear(mm,addr,ptep)		\
 	set_pte_at((mm), (addr), (ptep), __pte(0UL))
 
+#ifdef DCACHE_ALIASING_POSSIBLE
+#define __HAVE_ARCH_MOVE_PTE
+#define move_pte(pte, prot, old_addr, new_addr)				\
+({									\
+	pte_t newpte = (pte);						\
+	if (tlb_type != hypervisor && pte_present(pte)) {		\
+		unsigned long this_pfn = pte_pfn(pte);			\
+									\
+		if (pfn_valid(this_pfn) &&				\
+		    (((old_addr) ^ (new_addr)) & (1 << 13)))		\
+			flush_dcache_page_all(current->mm,		\
+					      pfn_to_page(this_pfn));	\
+	}								\
+	newpte;								\
+})
+#endif
+
 extern pgd_t swapper_pg_dir[2048];
 extern pmd_t swapper_low_pmd_dir[2048];
diff --git a/mm/slab.c b/mm/slab.c
index d31a06bfbea5..f1b644eb39d8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -207,11 +207,6 @@ typedef unsigned int kmem_bufctl_t;
 #define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
 #define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
 
-/* Max number of objs-per-slab for caches which use off-slab slabs.
- * Needed to avoid a possible looping condition in cache_grow().
- */
-static unsigned long offslab_limit;
-
 /*
  * struct slab
  *
@@ -1356,12 +1351,6 @@ void __init kmem_cache_init(void)
 					NULL, NULL);
 		}
 
-		/* Inc off-slab bufctl limit until the ceiling is hit. */
-		if (!(OFF_SLAB(sizes->cs_cachep))) {
-			offslab_limit = sizes->cs_size - sizeof(struct slab);
-			offslab_limit /= sizeof(kmem_bufctl_t);
-		}
-
 		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
 					sizes->cs_size,
 					ARCH_KMALLOC_MINALIGN,
@@ -1780,6 +1769,7 @@ static void set_up_list3s(struct kmem_cache *cachep, int index)
 static size_t calculate_slab_order(struct kmem_cache *cachep,
 			size_t size, size_t align, unsigned long flags)
 {
+	unsigned long offslab_limit;
 	size_t left_over = 0;
 	int gfporder;
 
@@ -1791,9 +1781,18 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 		if (!num)
 			continue;
 
-		/* More than offslab_limit objects will cause problems */
-		if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit)
-			break;
+		if (flags & CFLGS_OFF_SLAB) {
+			/*
+			 * Max number of objs-per-slab for caches which
+			 * use off-slab slabs. Needed to avoid a possible
+			 * looping condition in cache_grow().
+			 */
+			offslab_limit = size - sizeof(struct slab);
+			offslab_limit /= sizeof(kmem_bufctl_t);
+
+			if (num > offslab_limit)
+				break;
+		}
 
 		/* Found something acceptable - save it away */
 		cachep->num = num;
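Making offslab_limit a local of calculate_slab_order() fixes a subtle ordering problem: the old file-scope variable was assigned while the kmalloc caches were being initialised, so its final value depended on initialisation order rather than on the cache actually being sized. Recomputing it per candidate size keeps the check meaningful. A worked example under stated assumptions (object size 4096, sizeof(struct slab) == 32, sizeof(kmem_bufctl_t) == 4; real sizes vary by configuration):

#include <stdio.h>

/* Illustrative sizes only; the kernel uses the real sizeof() values
 * for the cache being created in calculate_slab_order(). */
#define OBJ_SIZE	4096UL	/* aligned object size under test */
#define STRUCT_SLAB	32UL	/* assumed sizeof(struct slab) */
#define BUFCTL		4UL	/* assumed sizeof(kmem_bufctl_t) */

int main(void)
{
	/* Mirrors the in-loop computation from the patch: how many
	 * bufctl entries fit once struct slab is accounted for. */
	unsigned long offslab_limit = (OBJ_SIZE - STRUCT_SLAB) / BUFCTL;

	/* num > offslab_limit makes calculate_slab_order() stop growing
	 * the order, which is what avoids the cache_grow() loop. */
	printf("offslab_limit = %lu objects\n", offslab_limit); /* 1016 */
	return 0;
}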