summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--arch/powerpc/mm/Makefile3
-rw-r--r--arch/powerpc/mm/fault.c25
-rw-r--r--arch/powerpc/mm/gup.c280
-rw-r--r--arch/powerpc/mm/hash_utils_64.c114
-rw-r--r--arch/powerpc/mm/hugetlbpage.c351
-rw-r--r--arch/powerpc/mm/init_64.c41
-rw-r--r--arch/powerpc/mm/mem.c41
-rw-r--r--arch/powerpc/mm/numa.c3
-rw-r--r--arch/powerpc/mm/pgtable_32.c22
-rw-r--r--arch/powerpc/mm/pgtable_64.c16
-rw-r--r--arch/powerpc/mm/ppc_mmu_32.c4
-rw-r--r--arch/powerpc/mm/tlb_64.c4
12 files changed, 672 insertions, 232 deletions
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 1c00e0196f6c..e7392b45a5ef 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -12,7 +12,8 @@ obj-y := fault.o mem.o \
mmu_context_$(CONFIG_WORD_SIZE).o
hash-$(CONFIG_PPC_NATIVE) := hash_native_64.o
obj-$(CONFIG_PPC64) += hash_utils_64.o \
- slb_low.o slb.o stab.o mmap.o $(hash-y)
+ slb_low.o slb.o stab.o \
+ gup.o mmap.o $(hash-y)
obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o
obj-$(CONFIG_PPC_STD_MMU) += hash_low_$(CONFIG_WORD_SIZE).o \
tlb_$(CONFIG_WORD_SIZE).o
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 1707d00331fc..565b7a237c84 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -100,31 +100,6 @@ static int store_updates_sp(struct pt_regs *regs)
return 0;
}
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
-static void do_dabr(struct pt_regs *regs, unsigned long address,
- unsigned long error_code)
-{
- siginfo_t info;
-
- if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
- 11, SIGSEGV) == NOTIFY_STOP)
- return;
-
- if (debugger_dabr_match(regs))
- return;
-
- /* Clear the DABR */
- set_dabr(0);
-
- /* Deliver the signal to userspace */
- info.si_signo = SIGTRAP;
- info.si_errno = 0;
- info.si_code = TRAP_HWBKPT;
- info.si_addr = (void __user *)address;
- force_sig_info(SIGTRAP, &info, current);
-}
-#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/
-
/*
* For 600- and 800-family processors, the error_code parameter is DSISR
* for a data fault, SRR1 for an instruction fault. For 400-family processors
diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c
new file mode 100644
index 000000000000..9fdf4d6335e4
--- /dev/null
+++ b/arch/powerpc/mm/gup.c
@@ -0,0 +1,280 @@
+/*
+ * Lockless get_user_pages_fast for powerpc
+ *
+ * Copyright (C) 2008 Nick Piggin
+ * Copyright (C) 2008 Novell Inc.
+ */
+#undef DEBUG
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/vmstat.h>
+#include <linux/pagemap.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
+/*
+ * The performance critical leaf functions are made noinline otherwise gcc
+ * inlines everything into a single function which results in too much
+ * register pressure.
+ */
+static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ unsigned long mask, result;
+ pte_t *ptep;
+
+ result = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ result |= _PAGE_RW;
+ mask = result | _PAGE_SPECIAL;
+
+ ptep = pte_offset_kernel(&pmd, addr);
+ do {
+ pte_t pte = *ptep;
+ struct page *page;
+
+ if ((pte_val(pte) & mask) != result)
+ return 0;
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ page = pte_page(pte);
+ if (!page_cache_get_speculative(page))
+ return 0;
+ if (unlikely(pte != *ptep)) {
+ put_page(page);
+ return 0;
+ }
+ pages[*nr] = page;
+ (*nr)++;
+
+ } while (ptep++, addr += PAGE_SIZE, addr != end);
+
+ return 1;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
+ unsigned long *addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long mask;
+ unsigned long pte_end;
+ struct page *head, *page;
+ pte_t pte;
+ int refs;
+
+ pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
+ if (pte_end < end)
+ end = pte_end;
+
+ pte = *ptep;
+ mask = _PAGE_PRESENT|_PAGE_USER;
+ if (write)
+ mask |= _PAGE_RW;
+ if ((pte_val(pte) & mask) != mask)
+ return 0;
+ /* hugepages are never "special" */
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+ refs = 0;
+ head = pte_page(pte);
+ page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON(compound_head(page) != head);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (*addr += PAGE_SIZE, *addr != end);
+
+ if (!page_cache_add_speculative(head, refs)) {
+ *nr -= refs;
+ return 0;
+ }
+ if (unlikely(pte != *ptep)) {
+ /* Could be optimized better */
+ while (*nr) {
+ put_page(page);
+ (*nr)--;
+ }
+ }
+
+ return 1;
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pmd_t *pmdp;
+
+ pmdp = pmd_offset(&pud, addr);
+ do {
+ pmd_t pmd = *pmdp;
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(pmd))
+ return 0;
+ if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+ return 0;
+ } while (pmdp++, addr = next, addr != end);
+
+ return 1;
+}
+
+static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pud_t *pudp;
+
+ pudp = pud_offset(&pgd, addr);
+ do {
+ pud_t pud = *pudp;
+
+ next = pud_addr_end(addr, end);
+ if (pud_none(pud))
+ return 0;
+ if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+ return 0;
+ } while (pudp++, addr = next, addr != end);
+
+ return 1;
+}
+
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr, len, end;
+ unsigned long next;
+ pgd_t *pgdp;
+ int psize, nr = 0;
+ unsigned int shift;
+
+ pr_debug("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
+
+ start &= PAGE_MASK;
+ addr = start;
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+
+ if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+ start, len)))
+ goto slow_irqon;
+
+ pr_debug(" aligned: %lx .. %lx\n", start, end);
+
+#ifdef CONFIG_HUGETLB_PAGE
+ /* We bail out on slice boundary crossing when hugetlb is
+ * enabled in order to not have to deal with two different
+ * page table formats
+ */
+ if (addr < SLICE_LOW_TOP) {
+ if (end > SLICE_LOW_TOP)
+ goto slow_irqon;
+
+ if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
+ GET_LOW_SLICE_INDEX(end - 1)))
+ goto slow_irqon;
+ } else {
+ if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
+ GET_HIGH_SLICE_INDEX(end - 1)))
+ goto slow_irqon;
+ }
+#endif /* CONFIG_HUGETLB_PAGE */
+
+ /*
+ * XXX: batch / limit 'nr', to avoid large irq off latency
+ * needs some instrumenting to determine the common sizes used by
+ * important workloads (eg. DB2), and whether limiting the batch size
+ * will decrease performance.
+ *
+ * It seems like we're in the clear for the moment. Direct-IO is
+ * the main guy that batches up lots of get_user_pages, and even
+ * they are limited to 64-at-a-time which is not so many.
+ */
+ /*
+ * This doesn't prevent pagetable teardown, but does prevent
+ * the pagetables from being freed on powerpc.
+ *
+ * So long as we atomically load page table pointers versus teardown,
+ * we can follow the address down to the the page and take a ref on it.
+ */
+ local_irq_disable();
+
+ psize = get_slice_psize(mm, addr);
+ shift = mmu_psize_defs[psize].shift;
+
+#ifdef CONFIG_HUGETLB_PAGE
+ if (unlikely(mmu_huge_psizes[psize])) {
+ pte_t *ptep;
+ unsigned long a = addr;
+ unsigned long sz = ((1UL) << shift);
+ struct hstate *hstate = size_to_hstate(sz);
+
+ BUG_ON(!hstate);
+ /*
+ * XXX: could be optimized to avoid hstate
+ * lookup entirely (just use shift)
+ */
+
+ do {
+ VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
+ ptep = huge_pte_offset(mm, a);
+ pr_debug(" %016lx: huge ptep %p\n", a, ptep);
+ if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
+ &nr))
+ goto slow;
+ } while (a != end);
+ } else
+#endif /* CONFIG_HUGETLB_PAGE */
+ {
+ pgdp = pgd_offset(mm, addr);
+ do {
+ pgd_t pgd = *pgdp;
+
+ VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
+ pr_debug(" %016lx: normal pgd %p\n", addr, (void *)pgd);
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(pgd))
+ goto slow;
+ if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+ goto slow;
+ } while (pgdp++, addr = next, addr != end);
+ }
+ local_irq_enable();
+
+ VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
+ return nr;
+
+ {
+ int ret;
+
+slow:
+ local_irq_enable();
+slow_irqon:
+ pr_debug(" slow path ! nr = %d\n", nr);
+
+ /* Try to get the remaining pages with get_user_pages */
+ start += nr << PAGE_SHIFT;
+ pages += nr;
+
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages(current, mm, start,
+ (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
+ up_read(&mm->mmap_sem);
+
+ /* Have to be a bit careful with return values */
+ if (nr > 0) {
+ if (ret < 0)
+ ret = nr;
+ else
+ ret += nr;
+ }
+
+ return ret;
+ }
+}
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 8d3b58ebd38e..14be408dfc9b 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -68,6 +68,7 @@
#define KB (1024)
#define MB (1024*KB)
+#define GB (1024L*MB)
/*
* Note: pte --> Linux PTE
@@ -102,7 +103,6 @@ int mmu_kernel_ssize = MMU_SEGSIZE_256M;
int mmu_highuser_ssize = MMU_SEGSIZE_256M;
u16 mmu_slb_size = 64;
#ifdef CONFIG_HUGETLB_PAGE
-int mmu_huge_psize = MMU_PAGE_16M;
unsigned int HPAGE_SHIFT;
#endif
#ifdef CONFIG_PPC_64K_PAGES
@@ -151,39 +151,53 @@ static struct mmu_psize_def mmu_psize_defaults_gp[] = {
},
};
+static unsigned long htab_convert_pte_flags(unsigned long pteflags)
+{
+ unsigned long rflags = pteflags & 0x1fa;
+
+ /* _PAGE_EXEC -> NOEXEC */
+ if ((pteflags & _PAGE_EXEC) == 0)
+ rflags |= HPTE_R_N;
+
+ /* PP bits. PAGE_USER is already PP bit 0x2, so we only
+ * need to add in 0x1 if it's a read-only user page
+ */
+ if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) &&
+ (pteflags & _PAGE_DIRTY)))
+ rflags |= 1;
+
+ /* Always add C */
+ return rflags | HPTE_R_C;
+}
int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
- unsigned long pstart, unsigned long mode,
+ unsigned long pstart, unsigned long prot,
int psize, int ssize)
{
unsigned long vaddr, paddr;
unsigned int step, shift;
- unsigned long tmp_mode;
int ret = 0;
shift = mmu_psize_defs[psize].shift;
step = 1 << shift;
+ prot = htab_convert_pte_flags(prot);
+
+ DBG("htab_bolt_mapping(%lx..%lx -> %lx (%lx,%d,%d)\n",
+ vstart, vend, pstart, prot, psize, ssize);
+
for (vaddr = vstart, paddr = pstart; vaddr < vend;
vaddr += step, paddr += step) {
unsigned long hash, hpteg;
unsigned long vsid = get_kernel_vsid(vaddr, ssize);
unsigned long va = hpt_va(vaddr, vsid, ssize);
- tmp_mode = mode;
-
- /* Make non-kernel text non-executable */
- if (!in_kernel_text(vaddr))
- tmp_mode = mode | HPTE_R_N;
-
hash = hpt_hash(va, shift, ssize);
hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
- DBG("htab_bolt_mapping: calling %p\n", ppc_md.hpte_insert);
-
BUG_ON(!ppc_md.hpte_insert);
- ret = ppc_md.hpte_insert(hpteg, va, paddr,
- tmp_mode, HPTE_V_BOLTED, psize, ssize);
+ ret = ppc_md.hpte_insert(hpteg, va, paddr, prot,
+ HPTE_V_BOLTED, psize, ssize);
if (ret < 0)
break;
@@ -329,6 +343,44 @@ static int __init htab_dt_scan_page_sizes(unsigned long node,
return 0;
}
+/* Scan for 16G memory blocks that have been set aside for huge pages
+ * and reserve those blocks for 16G huge pages.
+ */
+static int __init htab_dt_scan_hugepage_blocks(unsigned long node,
+ const char *uname, int depth,
+ void *data) {
+ char *type = of_get_flat_dt_prop(node, "device_type", NULL);
+ unsigned long *addr_prop;
+ u32 *page_count_prop;
+ unsigned int expected_pages;
+ long unsigned int phys_addr;
+ long unsigned int block_size;
+
+ /* We are scanning "memory" nodes only */
+ if (type == NULL || strcmp(type, "memory") != 0)
+ return 0;
+
+ /* This property is the log base 2 of the number of virtual pages that
+ * will represent this memory block. */
+ page_count_prop = of_get_flat_dt_prop(node, "ibm,expected#pages", NULL);
+ if (page_count_prop == NULL)
+ return 0;
+ expected_pages = (1 << page_count_prop[0]);
+ addr_prop = of_get_flat_dt_prop(node, "reg", NULL);
+ if (addr_prop == NULL)
+ return 0;
+ phys_addr = addr_prop[0];
+ block_size = addr_prop[1];
+ if (block_size != (16 * GB))
+ return 0;
+ printk(KERN_INFO "Huge page(16GB) memory: "
+ "addr = 0x%lX size = 0x%lX pages = %d\n",
+ phys_addr, block_size, expected_pages);
+ lmb_reserve(phys_addr, block_size * expected_pages);
+ add_gpage(phys_addr, block_size, expected_pages);
+ return 0;
+}
+
static void __init htab_init_page_sizes(void)
{
int rc;
@@ -418,15 +470,18 @@ static void __init htab_init_page_sizes(void)
);
#ifdef CONFIG_HUGETLB_PAGE
- /* Init large page size. Currently, we pick 16M or 1M depending
+ /* Reserve 16G huge page memory sections for huge pages */
+ of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
+
+/* Set default large page size. Currently, we pick 16M or 1M depending
* on what is available
*/
if (mmu_psize_defs[MMU_PAGE_16M].shift)
- set_huge_psize(MMU_PAGE_16M);
+ HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
/* With 4k/4level pagetables, we can't (for now) cope with a
* huge page size < PMD_SIZE */
else if (mmu_psize_defs[MMU_PAGE_1M].shift)
- set_huge_psize(MMU_PAGE_1M);
+ HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
#endif /* CONFIG_HUGETLB_PAGE */
}
@@ -478,9 +533,9 @@ static unsigned long __init htab_get_table_size(void)
#ifdef CONFIG_MEMORY_HOTPLUG
void create_section_mapping(unsigned long start, unsigned long end)
{
- BUG_ON(htab_bolt_mapping(start, end, __pa(start),
- _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX,
- mmu_linear_psize, mmu_kernel_ssize));
+ BUG_ON(htab_bolt_mapping(start, end, __pa(start),
+ PAGE_KERNEL, mmu_linear_psize,
+ mmu_kernel_ssize));
}
int remove_section_mapping(unsigned long start, unsigned long end)
@@ -529,7 +584,7 @@ void __init htab_initialize(void)
{
unsigned long table;
unsigned long pteg_count;
- unsigned long mode_rw;
+ unsigned long prot, tprot;
unsigned long base = 0, size = 0, limit;
int i;
@@ -587,7 +642,7 @@ void __init htab_initialize(void)
mtspr(SPRN_SDR1, _SDR1);
}
- mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX;
+ prot = PAGE_KERNEL;
#ifdef CONFIG_DEBUG_PAGEALLOC
linear_map_hash_count = lmb_end_of_DRAM() >> PAGE_SHIFT;
@@ -605,8 +660,10 @@ void __init htab_initialize(void)
for (i=0; i < lmb.memory.cnt; i++) {
base = (unsigned long)__va(lmb.memory.region[i].base);
size = lmb.memory.region[i].size;
+ tprot = prot | (in_kernel_text(base) ? _PAGE_EXEC : 0);
- DBG("creating mapping for region: %lx : %lx\n", base, size);
+ DBG("creating mapping for region: %lx..%lx (prot: %x)\n",
+ base, size, tprot);
#ifdef CONFIG_U3_DART
/* Do not map the DART space. Fortunately, it will be aligned
@@ -623,21 +680,21 @@ void __init htab_initialize(void)
unsigned long dart_table_end = dart_tablebase + 16 * MB;
if (base != dart_tablebase)
BUG_ON(htab_bolt_mapping(base, dart_tablebase,
- __pa(base), mode_rw,
+ __pa(base), tprot,
mmu_linear_psize,
mmu_kernel_ssize));
if ((base + size) > dart_table_end)
BUG_ON(htab_bolt_mapping(dart_tablebase+16*MB,
base + size,
__pa(dart_table_end),
- mode_rw,
+ tprot,
mmu_linear_psize,
mmu_kernel_ssize));
continue;
}
#endif /* CONFIG_U3_DART */
BUG_ON(htab_bolt_mapping(base, base + size, __pa(base),
- mode_rw, mmu_linear_psize, mmu_kernel_ssize));
+ tprot, mmu_linear_psize, mmu_kernel_ssize));
}
/*
@@ -655,7 +712,7 @@ void __init htab_initialize(void)
tce_alloc_start = base + size + 1;
BUG_ON(htab_bolt_mapping(tce_alloc_start, tce_alloc_end,
- __pa(tce_alloc_start), mode_rw,
+ __pa(tce_alloc_start), prot,
mmu_linear_psize, mmu_kernel_ssize));
}
@@ -847,7 +904,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
#ifdef CONFIG_HUGETLB_PAGE
/* Handle hugepage regions */
- if (HPAGE_SHIFT && psize == mmu_huge_psize) {
+ if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
DBG_LOW(" -> huge page !\n");
return hash_huge_page(mm, access, ea, vsid, local, trap);
}
@@ -1076,8 +1133,7 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
unsigned long hash, hpteg;
unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
unsigned long va = hpt_va(vaddr, vsid, mmu_kernel_ssize);
- unsigned long mode = _PAGE_ACCESSED | _PAGE_DIRTY |
- _PAGE_COHERENT | PP_RWXX | HPTE_R_N;
+ unsigned long mode = htab_convert_pte_flags(PAGE_KERNEL);
int ret;
hash = hpt_hash(va, PAGE_SHIFT, mmu_kernel_ssize);
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0d12fba31bc5..f1c2d55b4377 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -24,21 +24,43 @@
#include <asm/cputable.h>
#include <asm/spu.h>
-#define HPAGE_SHIFT_64K 16
-#define HPAGE_SHIFT_16M 24
+#define PAGE_SHIFT_64K 16
+#define PAGE_SHIFT_16M 24
+#define PAGE_SHIFT_16G 34
#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
+#define MAX_NUMBER_GPAGES 1024
-unsigned int hugepte_shift;
-#define PTRS_PER_HUGEPTE (1 << hugepte_shift)
-#define HUGEPTE_TABLE_SIZE (sizeof(pte_t) << hugepte_shift)
+/* Tracks the 16G pages after the device tree is scanned and before the
+ * huge_boot_pages list is ready. */
+static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
+static unsigned nr_gpages;
-#define HUGEPD_SHIFT (HPAGE_SHIFT + hugepte_shift)
-#define HUGEPD_SIZE (1UL << HUGEPD_SHIFT)
-#define HUGEPD_MASK (~(HUGEPD_SIZE-1))
+/* Array of valid huge page sizes - non-zero value(hugepte_shift) is
+ * stored for the huge page sizes that are valid.
+ */
+unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
+
+#define hugepte_shift mmu_huge_psizes
+#define PTRS_PER_HUGEPTE(psize) (1 << hugepte_shift[psize])
+#define HUGEPTE_TABLE_SIZE(psize) (sizeof(pte_t) << hugepte_shift[psize])
+
+#define HUGEPD_SHIFT(psize) (mmu_psize_to_shift(psize) \
+ + hugepte_shift[psize])
+#define HUGEPD_SIZE(psize) (1UL << HUGEPD_SHIFT(psize))
+#define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1))
+
+/* Subtract one from array size because we don't need a cache for 4K since
+ * is not a huge page size */
+#define huge_pgtable_cache(psize) (pgtable_cache[HUGEPTE_CACHE_NUM \
+ + psize-1])
+#define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize])
-#define huge_pgtable_cache (pgtable_cache[HUGEPTE_CACHE_NUM])
+static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
+ "unused_4K", "hugepte_cache_64K", "unused_64K_AP",
+ "hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G"
+};
/* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
* will choke on pointers to hugepte tables, which is handy for
@@ -49,24 +71,49 @@ typedef struct { unsigned long pd; } hugepd_t;
#define hugepd_none(hpd) ((hpd).pd == 0)
+static inline int shift_to_mmu_psize(unsigned int shift)
+{
+ switch (shift) {
+#ifndef CONFIG_PPC_64K_PAGES
+ case PAGE_SHIFT_64K:
+ return MMU_PAGE_64K;
+#endif
+ case PAGE_SHIFT_16M:
+ return MMU_PAGE_16M;
+ case PAGE_SHIFT_16G:
+ return MMU_PAGE_16G;
+ }
+ return -1;
+}
+
+static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
+{
+ if (mmu_psize_defs[mmu_psize].shift)
+ return mmu_psize_defs[mmu_psize].shift;
+ BUG();
+}
+
static inline pte_t *hugepd_page(hugepd_t hpd)
{
BUG_ON(!(hpd.pd & HUGEPD_OK));
return (pte_t *)(hpd.pd & ~HUGEPD_OK);
}
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
+ struct hstate *hstate)
{
- unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
+ unsigned int shift = huge_page_shift(hstate);
+ int psize = shift_to_mmu_psize(shift);
+ unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
pte_t *dir = hugepd_page(*hpdp);
return dir + idx;
}
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
- unsigned long address)
+ unsigned long address, unsigned int psize)
{
- pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
+ pte_t *new = kmem_cache_zalloc(huge_pgtable_cache(psize),
GFP_KERNEL|__GFP_REPEAT);
if (! new)
@@ -74,7 +121,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
spin_lock(&mm->page_table_lock);
if (!hugepd_none(*hpdp))
- kmem_cache_free(huge_pgtable_cache, new);
+ kmem_cache_free(huge_pgtable_cache(psize), new);
else
hpdp->pd = (unsigned long)new | HUGEPD_OK;
spin_unlock(&mm->page_table_lock);
@@ -83,27 +130,60 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
/* Base page size affects how we walk hugetlb page tables */
#ifdef CONFIG_PPC_64K_PAGES
-#define hpmd_offset(pud, addr) pmd_offset(pud, addr)
-#define hpmd_alloc(mm, pud, addr) pmd_alloc(mm, pud, addr)
+#define hpmd_offset(pud, addr, h) pmd_offset(pud, addr)
+#define hpmd_alloc(mm, pud, addr, h) pmd_alloc(mm, pud, addr)
#else
static inline
-pmd_t *hpmd_offset(pud_t *pud, unsigned long addr)
+pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
{
- if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
+ if (huge_page_shift(hstate) == PAGE_SHIFT_64K)
return pmd_offset(pud, addr);
else
return (pmd_t *) pud;
}
static inline
-pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr)
+pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
+ struct hstate *hstate)
{
- if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
+ if (huge_page_shift(hstate) == PAGE_SHIFT_64K)
return pmd_alloc(mm, pud, addr);
else
return (pmd_t *) pud;
}
#endif
+/* Build list of addresses of gigantic pages. This function is used in early
+ * boot before the buddy or bootmem allocator is setup.
+ */
+void add_gpage(unsigned long addr, unsigned long page_size,
+ unsigned long number_of_pages)
+{
+ if (!addr)
+ return;
+ while (number_of_pages > 0) {
+ gpage_freearray[nr_gpages] = addr;
+ nr_gpages++;
+ number_of_pages--;
+ addr += page_size;
+ }
+}
+
+/* Moves the gigantic page addresses from the temporary list to the
+ * huge_boot_pages list.
+ */
+int alloc_bootmem_huge_page(struct hstate *hstate)
+{
+ struct huge_bootmem_page *m;
+ if (nr_gpages == 0)
+ return 0;
+ m = phys_to_virt(gpage_freearray[--nr_gpages]);
+ gpage_freearray[nr_gpages] = 0;
+ list_add(&m->list, &huge_boot_pages);
+ m->hstate = hstate;
+ return 1;
+}
+
+
/* Modelled after find_linux_pte() */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
@@ -111,39 +191,52 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
pud_t *pu;
pmd_t *pm;
- BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
+ unsigned int psize;
+ unsigned int shift;
+ unsigned long sz;
+ struct hstate *hstate;
+ psize = get_slice_psize(mm, addr);
+ shift = mmu_psize_to_shift(psize);
+ sz = ((1UL) << shift);
+ hstate = size_to_hstate(sz);
- addr &= HPAGE_MASK;
+ addr &= hstate->mask;
pg = pgd_offset(mm, addr);
if (!pgd_none(*pg)) {
pu = pud_offset(pg, addr);
if (!pud_none(*pu)) {
- pm = hpmd_offset(pu, addr);
+ pm = hpmd_offset(pu, addr, hstate);
if (!pmd_none(*pm))
- return hugepte_offset((hugepd_t *)pm, addr);
+ return hugepte_offset((hugepd_t *)pm, addr,
+ hstate);
}
}
return NULL;
}
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+ unsigned long addr, unsigned long sz)
{
pgd_t *pg;
pud_t *pu;
pmd_t *pm;
hugepd_t *hpdp = NULL;
+ struct hstate *hstate;
+ unsigned int psize;
+ hstate = size_to_hstate(sz);
- BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
+ psize = get_slice_psize(mm, addr);
+ BUG_ON(!mmu_huge_psizes[psize]);
- addr &= HPAGE_MASK;
+ addr &= hstate->mask;
pg = pgd_offset(mm, addr);
pu = pud_alloc(mm, pg, addr);
if (pu) {
- pm = hpmd_alloc(mm, pu, addr);
+ pm = hpmd_alloc(mm, pu, addr, hstate);
if (pm)
hpdp = (hugepd_t *)pm;
}
@@ -151,10 +244,10 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
if (! hpdp)
return NULL;
- if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
+ if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
return NULL;
- return hugepte_offset(hpdp, addr);
+ return hugepte_offset(hpdp, addr, hstate);
}
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
@@ -162,19 +255,22 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
return 0;
}
-static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
+static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
+ unsigned int psize)
{
pte_t *hugepte = hugepd_page(*hpdp);
hpdp->pd = 0;
tlb->need_flush = 1;
- pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
+ pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
+ HUGEPTE_CACHE_NUM+psize-1,
PGF_CACHENUM_MASK));
}
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
unsigned long addr, unsigned long end,
- unsigned long floor, unsigned long ceiling)
+ unsigned long floor, unsigned long ceiling,
+ unsigned int psize)
{
pmd_t *pmd;
unsigned long next;
@@ -186,7 +282,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_none(*pmd))
continue;
- free_hugepte_range(tlb, (hugepd_t *)pmd);
+ free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
} while (pmd++, addr = next, addr != end);
start &= PUD_MASK;
@@ -212,6 +308,9 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
pud_t *pud;
unsigned long next;
unsigned long start;
+ unsigned int shift;
+ unsigned int psize = get_slice_psize(tlb->mm, addr);
+ shift = mmu_psize_to_shift(psize);
start = addr;
pud = pud_offset(pgd, addr);
@@ -220,16 +319,18 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
#ifdef CONFIG_PPC_64K_PAGES
if (pud_none_or_clear_bad(pud))
continue;
- hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+ hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling,
+ psize);
#else
- if (HPAGE_SHIFT == HPAGE_SHIFT_64K) {
+ if (shift == PAGE_SHIFT_64K) {
if (pud_none_or_clear_bad(pud))
continue;
- hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+ hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
+ ceiling, psize);
} else {
if (pud_none(*pud))
continue;
- free_hugepte_range(tlb, (hugepd_t *)pud);
+ free_hugepte_range(tlb, (hugepd_t *)pud, psize);
}
#endif
} while (pud++, addr = next, addr != end);
@@ -255,7 +356,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
*
* Must be called with pagetable lock held.
*/
-void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+void hugetlb_free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
unsigned long floor, unsigned long ceiling)
{
@@ -297,31 +398,33 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb,
* now has no other vmas using it, so can be freed, we don't
* bother to round floor or end up - the tests don't need that.
*/
+ unsigned int psize = get_slice_psize(tlb->mm, addr);
- addr &= HUGEPD_MASK;
+ addr &= HUGEPD_MASK(psize);
if (addr < floor) {
- addr += HUGEPD_SIZE;
+ addr += HUGEPD_SIZE(psize);
if (!addr)
return;
}
if (ceiling) {
- ceiling &= HUGEPD_MASK;
+ ceiling &= HUGEPD_MASK(psize);
if (!ceiling)
return;
}
if (end - 1 > ceiling - 1)
- end -= HUGEPD_SIZE;
+ end -= HUGEPD_SIZE(psize);
if (addr > end - 1)
return;
start = addr;
- pgd = pgd_offset((*tlb)->mm, addr);
+ pgd = pgd_offset(tlb->mm, addr);
do {
- BUG_ON(get_slice_psize((*tlb)->mm, addr) != mmu_huge_psize);
+ psize = get_slice_psize(tlb->mm, addr);
+ BUG_ON(!mmu_huge_psizes[psize]);
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+ hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
} while (pgd++, addr = next, addr != end);
}
@@ -334,7 +437,11 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
* necessary anymore if we make hpte_need_flush() get the
* page size from the slices
*/
- pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);
+ unsigned int psize = get_slice_psize(mm, addr);
+ unsigned int shift = mmu_psize_to_shift(psize);
+ unsigned long sz = ((1UL) << shift);
+ struct hstate *hstate = size_to_hstate(sz);
+ pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
}
*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}
@@ -351,14 +458,19 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
pte_t *ptep;
struct page *page;
+ unsigned int mmu_psize = get_slice_psize(mm, address);
- if (get_slice_psize(mm, address) != mmu_huge_psize)
+ /* Verify it is a huge page else bail. */
+ if (!mmu_huge_psizes[mmu_psize])
return ERR_PTR(-EINVAL);
ptep = huge_pte_offset(mm, address);
page = pte_page(*ptep);
- if (page)
- page += (address % HPAGE_SIZE) / PAGE_SIZE;
+ if (page) {
+ unsigned int shift = mmu_psize_to_shift(mmu_psize);
+ unsigned long sz = ((1UL) << shift);
+ page += (address % sz) / PAGE_SIZE;
+ }
return page;
}
@@ -368,6 +480,11 @@ int pmd_huge(pmd_t pmd)
return 0;
}
+int pud_huge(pud_t pud)
+{
+ return 0;
+}
+
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write)
@@ -381,15 +498,16 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags)
{
- return slice_get_unmapped_area(addr, len, flags,
- mmu_huge_psize, 1, 0);
+ struct hstate *hstate = hstate_file(file);
+ int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
+ return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
}
/*
* Called by asm hashtable.S for doing lazy icache flush
*/
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
- pte_t pte, int trap)
+ pte_t pte, int trap, unsigned long sz)
{
struct page *page;
int i;
@@ -402,7 +520,7 @@ static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
/* page is dirty */
if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
if (trap == 0x400) {
- for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
+ for (i = 0; i < (sz / PAGE_SIZE); i++)
__flush_dcache_icache(page_address(page+i));
set_bit(PG_arch_1, &page->flags);
} else {
@@ -418,11 +536,16 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
{
pte_t *ptep;
unsigned long old_pte, new_pte;
- unsigned long va, rflags, pa;
+ unsigned long va, rflags, pa, sz;
long slot;
int err = 1;
int ssize = user_segment_size(ea);
+ unsigned int mmu_psize;
+ int shift;
+ mmu_psize = get_slice_psize(mm, ea);
+ if (!mmu_huge_psizes[mmu_psize])
+ goto out;
ptep = huge_pte_offset(mm, ea);
/* Search the Linux page table for a match with va */
@@ -465,30 +588,32 @@ int hash_huge_page(struct mm_struct *mm, unsigned long access,
rflags = 0x2 | (!(new_pte & _PAGE_RW));
/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+ shift = mmu_psize_to_shift(mmu_psize);
+ sz = ((1UL) << shift);
if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
/* No CPU has hugepages but lacks no execute, so we
* don't need to worry about that case */
rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
- trap);
+ trap, sz);
/* Check if pte already has an hpte (case 2) */
if (unlikely(old_pte & _PAGE_HASHPTE)) {
/* There MIGHT be an HPTE for this pte */
unsigned long hash, slot;
- hash = hpt_hash(va, HPAGE_SHIFT, ssize);
+ hash = hpt_hash(va, shift, ssize);
if (old_pte & _PAGE_F_SECOND)
hash = ~hash;
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
slot += (old_pte & _PAGE_F_GIX) >> 12;
- if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
+ if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
ssize, local) == -1)
old_pte &= ~_PAGE_HPTEFLAGS;
}
if (likely(!(old_pte & _PAGE_HASHPTE))) {
- unsigned long hash = hpt_hash(va, HPAGE_SHIFT, ssize);
+ unsigned long hash = hpt_hash(va, shift, ssize);
unsigned long hpte_group;
pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
@@ -509,7 +634,7 @@ repeat:
/* Insert into the hash table, primary slot */
slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
- mmu_huge_psize, ssize);
+ mmu_psize, ssize);
/* Primary is full, try the secondary */
if (unlikely(slot == -1)) {
@@ -517,7 +642,7 @@ repeat:
HPTES_PER_GROUP) & ~0x7UL;
slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
HPTE_V_SECONDARY,
- mmu_huge_psize, ssize);
+ mmu_psize, ssize);
if (slot == -1) {
if (mftb() & 0x1)
hpte_group = ((hash & htab_hash_mask) *
@@ -549,45 +674,54 @@ void set_huge_psize(int psize)
{
/* Check that it is a page size supported by the hardware and
* that it fits within pagetable limits. */
- if (mmu_psize_defs[psize].shift && mmu_psize_defs[psize].shift < SID_SHIFT &&
+ if (mmu_psize_defs[psize].shift &&
+ mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
(mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
- mmu_psize_defs[psize].shift == HPAGE_SHIFT_64K)) {
- HPAGE_SHIFT = mmu_psize_defs[psize].shift;
- mmu_huge_psize = psize;
-#ifdef CONFIG_PPC_64K_PAGES
- hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
-#else
- if (HPAGE_SHIFT == HPAGE_SHIFT_64K)
- hugepte_shift = (PMD_SHIFT-HPAGE_SHIFT);
- else
- hugepte_shift = (PUD_SHIFT-HPAGE_SHIFT);
-#endif
-
+ mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
+ mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
+ /* Return if huge page size has already been setup or is the
+ * same as the base page size. */
+ if (mmu_huge_psizes[psize] ||
+ mmu_psize_defs[psize].shift == PAGE_SHIFT)
+ return;
+ hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
+
+ switch (mmu_psize_defs[psize].shift) {
+ case PAGE_SHIFT_64K:
+ /* We only allow 64k hpages with 4k base page,
+ * which was checked above, and always put them
+ * at the PMD */
+ hugepte_shift[psize] = PMD_SHIFT;
+ break;
+ case PAGE_SHIFT_16M:
+ /* 16M pages can be at two different levels
+ * of pagestables based on base page size */
+ if (PAGE_SHIFT == PAGE_SHIFT_64K)
+ hugepte_shift[psize] = PMD_SHIFT;
+ else /* 4k base page */
+ hugepte_shift[psize] = PUD_SHIFT;
+ break;
+ case PAGE_SHIFT_16G:
+ /* 16G pages are always at PGD level */
+ hugepte_shift[psize] = PGDIR_SHIFT;
+ break;
+ }
+ hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
} else
- HPAGE_SHIFT = 0;
+ hugepte_shift[psize] = 0;
}
static int __init hugepage_setup_sz(char *str)
{
unsigned long long size;
- int mmu_psize = -1;
+ int mmu_psize;
int shift;
size = memparse(str, &str);
shift = __ffs(size);
- switch (shift) {
-#ifndef CONFIG_PPC_64K_PAGES
- case HPAGE_SHIFT_64K:
- mmu_psize = MMU_PAGE_64K;
- break;
-#endif
- case HPAGE_SHIFT_16M:
- mmu_psize = MMU_PAGE_16M;
- break;
- }
-
- if (mmu_psize >=0 && mmu_psize_defs[mmu_psize].shift)
+ mmu_psize = shift_to_mmu_psize(shift);
+ if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
set_huge_psize(mmu_psize);
else
printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
@@ -596,23 +730,40 @@ static int __init hugepage_setup_sz(char *str)
}
__setup("hugepagesz=", hugepage_setup_sz);
-static void zero_ctor(struct kmem_cache *cache, void *addr)
-{
- memset(addr, 0, kmem_cache_size(cache));
-}
-
static int __init hugetlbpage_init(void)
{
+ unsigned int psize;
+
if (!cpu_has_feature(CPU_FTR_16M_PAGE))
return -ENODEV;
- huge_pgtable_cache = kmem_cache_create("hugepte_cache",
- HUGEPTE_TABLE_SIZE,
- HUGEPTE_TABLE_SIZE,
- 0,
- zero_ctor);
- if (! huge_pgtable_cache)
- panic("hugetlbpage_init(): could not create hugepte cache\n");
+ /* Add supported huge page sizes. Need to change HUGE_MAX_HSTATE
+ * and adjust PTE_NONCACHE_NUM if the number of supported huge page
+ * sizes changes.
+ */
+ set_huge_psize(MMU_PAGE_16M);
+ set_huge_psize(MMU_PAGE_16G);
+
+ /* Temporarily disable support for 64K huge pages when 64K SPU local
+ * store support is enabled as the current implementation conflicts.
+ */
+#ifndef CONFIG_SPU_FS_64K_LS
+ set_huge_psize(MMU_PAGE_64K);
+#endif
+
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ if (mmu_huge_psizes[psize]) {
+ huge_pgtable_cache(psize) = kmem_cache_create(
+ HUGEPTE_CACHE_NAME(psize),
+ HUGEPTE_TABLE_SIZE(psize),
+ HUGEPTE_TABLE_SIZE(psize),
+ 0,
+ NULL);
+ if (!huge_pgtable_cache(psize))
+ panic("hugetlbpage_init(): could not create %s"\
+ "\n", HUGEPTE_CACHE_NAME(psize));
+ }
+ }
return 0;
}
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 6ef63caca682..036fe2f10c77 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -136,9 +136,14 @@ static int __init setup_kcore(void)
module_init(setup_kcore);
#endif
-static void zero_ctor(struct kmem_cache *cache, void *addr)
+static void pgd_ctor(void *addr)
{
- memset(addr, 0, kmem_cache_size(cache));
+ memset(addr, 0, PGD_TABLE_SIZE);
+}
+
+static void pmd_ctor(void *addr)
+{
+ memset(addr, 0, PMD_TABLE_SIZE);
}
static const unsigned int pgtable_cache_size[2] = {
@@ -153,29 +158,18 @@ static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
};
#ifdef CONFIG_HUGETLB_PAGE
-/* Hugepages need one extra cache, initialized in hugetlbpage.c. We
- * can't put into the tables above, because HPAGE_SHIFT is not compile
- * time constant. */
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+1];
+/* Hugepages need an extra cache per hugepagesize, initialized in
+ * hugetlbpage.c. We can't put into the tables above, because HPAGE_SHIFT
+ * is not compile time constant. */
+struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT];
#else
struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
#endif
void pgtable_cache_init(void)
{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) {
- int size = pgtable_cache_size[i];
- const char *name = pgtable_cache_name[i];
-
- pr_debug("Allocating page table cache %s (#%d) "
- "for size: %08x...\n", name, i, size);
- pgtable_cache[i] = kmem_cache_create(name,
- size, size,
- SLAB_PANIC,
- zero_ctor);
- }
+ pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor);
+ pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor);
}
#ifdef CONFIG_SPARSEMEM_VMEMMAP
@@ -212,13 +206,10 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size)
int __meminit vmemmap_populate(struct page *start_page,
unsigned long nr_pages, int node)
{
- unsigned long mode_rw;
unsigned long start = (unsigned long)start_page;
unsigned long end = (unsigned long)(start_page + nr_pages);
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
- mode_rw = _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_COHERENT | PP_RWXX;
-
/* Align to the page size of the linear mapping. */
start = _ALIGN_DOWN(start, page_size);
@@ -236,9 +227,9 @@ int __meminit vmemmap_populate(struct page *start_page,
pr_debug("vmemmap %08lx allocated at %p, physical %08lx.\n",
start, p, __pa(p));
- mapped = htab_bolt_mapping(start, start + page_size,
- __pa(p), mode_rw, mmu_vmemmap_psize,
- mmu_kernel_ssize);
+ mapped = htab_bolt_mapping(start, start + page_size, __pa(p),
+ PAGE_KERNEL, mmu_vmemmap_psize,
+ mmu_kernel_ssize);
BUG_ON(mapped < 0);
}
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 1ca2235f0965..1c93c255873b 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -186,45 +186,6 @@ walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg,
}
EXPORT_SYMBOL_GPL(walk_memory_resource);
-void show_mem(void)
-{
- unsigned long total = 0, reserved = 0;
- unsigned long shared = 0, cached = 0;
- unsigned long highmem = 0;
- struct page *page;
- pg_data_t *pgdat;
- unsigned long i;
-
- printk("Mem-info:\n");
- show_free_areas();
- for_each_online_pgdat(pgdat) {
- unsigned long flags;
- pgdat_resize_lock(pgdat, &flags);
- for (i = 0; i < pgdat->node_spanned_pages; i++) {
- if (!pfn_valid(pgdat->node_start_pfn + i))
- continue;
- page = pgdat_page_nr(pgdat, i);
- total++;
- if (PageHighMem(page))
- highmem++;
- if (PageReserved(page))
- reserved++;
- else if (PageSwapCache(page))
- cached++;
- else if (page_count(page))
- shared += page_count(page) - 1;
- }
- pgdat_resize_unlock(pgdat, &flags);
- }
- printk("%ld pages of RAM\n", total);
-#ifdef CONFIG_HIGHMEM
- printk("%ld pages of HIGHMEM\n", highmem);
-#endif
- printk("%ld reserved pages\n", reserved);
- printk("%ld pages shared\n", shared);
- printk("%ld pages swap cached\n", cached);
-}
-
/*
* Initialize the bootmem system and give it all the memory we
* have available. If we are using highmem, we only put the
@@ -350,7 +311,7 @@ void __init paging_init(void)
#endif /* CONFIG_HIGHMEM */
printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%lx\n",
- (u64)top_of_ram, total_ram);
+ (unsigned long long)top_of_ram, total_ram);
printk(KERN_DEBUG "Memory hole size: %ldMB\n",
(long int)((top_of_ram - total_ram) >> 20));
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index cf4bffba6f7c..d9a181351332 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -39,7 +39,6 @@ EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(node_data);
-static bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
@@ -816,7 +815,7 @@ void __init do_init_bootmem(void)
dbg("node %d\n", nid);
dbg("NODE_DATA() = %p\n", NODE_DATA(nid));
- NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
+ NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
NODE_DATA(nid)->node_start_pfn = start_pfn;
NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index c7584072dfcc..2001abdb1912 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -145,13 +145,20 @@ void pte_free(struct mm_struct *mm, pgtable_t ptepage)
void __iomem *
ioremap(phys_addr_t addr, unsigned long size)
{
- return __ioremap(addr, size, _PAGE_NO_CACHE);
+ return __ioremap(addr, size, _PAGE_NO_CACHE | _PAGE_GUARDED);
}
EXPORT_SYMBOL(ioremap);
void __iomem *
ioremap_flags(phys_addr_t addr, unsigned long size, unsigned long flags)
{
+ /* writeable implies dirty for kernel addresses */
+ if (flags & _PAGE_RW)
+ flags |= _PAGE_DIRTY | _PAGE_HWWRITE;
+
+ /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
+ flags &= ~(_PAGE_USER | _PAGE_EXEC | _PAGE_HWEXEC);
+
return __ioremap(addr, size, flags);
}
EXPORT_SYMBOL(ioremap_flags);
@@ -163,6 +170,14 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
phys_addr_t p;
int err;
+ /* Make sure we have the base flags */
+ if ((flags & _PAGE_PRESENT) == 0)
+ flags |= _PAGE_KERNEL;
+
+ /* Non-cacheable page cannot be coherent */
+ if (flags & _PAGE_NO_CACHE)
+ flags &= ~_PAGE_COHERENT;
+
/*
* Choose an address to map it to.
* Once the vmalloc system is running, we use it.
@@ -219,11 +234,6 @@ __ioremap(phys_addr_t addr, unsigned long size, unsigned long flags)
v = (ioremap_bot -= size);
}
- if ((flags & _PAGE_PRESENT) == 0)
- flags |= _PAGE_KERNEL;
- if (flags & _PAGE_NO_CACHE)
- flags |= _PAGE_GUARDED;
-
/*
* Should check if it is a candidate for a BAT mapping
*/
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 3ef0ad2f9ca0..365e61ae5dbc 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -107,9 +107,18 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size,
{
unsigned long i;
+ /* Make sure we have the base flags */
if ((flags & _PAGE_PRESENT) == 0)
flags |= pgprot_val(PAGE_KERNEL);
+ /* Non-cacheable page cannot be coherent */
+ if (flags & _PAGE_NO_CACHE)
+ flags &= ~_PAGE_COHERENT;
+
+ /* We don't support the 4K PFN hack with ioremap */
+ if (flags & _PAGE_4K_PFN)
+ return NULL;
+
WARN_ON(pa & ~PAGE_MASK);
WARN_ON(((unsigned long)ea) & ~PAGE_MASK);
WARN_ON(size & ~PAGE_MASK);
@@ -190,6 +199,13 @@ void __iomem * ioremap(phys_addr_t addr, unsigned long size)
void __iomem * ioremap_flags(phys_addr_t addr, unsigned long size,
unsigned long flags)
{
+ /* writeable implies dirty for kernel addresses */
+ if (flags & _PAGE_RW)
+ flags |= _PAGE_DIRTY;
+
+ /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
+ flags &= ~(_PAGE_USER | _PAGE_EXEC);
+
if (ppc_md.ioremap)
return ppc_md.ioremap(addr, size, flags);
return __ioremap(addr, size, flags);
diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c
index c53145f61942..6aa120813775 100644
--- a/arch/powerpc/mm/ppc_mmu_32.c
+++ b/arch/powerpc/mm/ppc_mmu_32.c
@@ -236,8 +236,8 @@ void __init MMU_init_hw(void)
Hash_end = (struct hash_pte *) ((unsigned long)Hash + Hash_size);
- printk("Total memory = %ldMB; using %ldkB for hash table (at %p)\n",
- total_memory >> 20, Hash_size >> 10, Hash);
+ printk("Total memory = %lldMB; using %ldkB for hash table (at %p)\n",
+ (unsigned long long)(total_memory >> 20), Hash_size >> 10, Hash);
/*
diff --git a/arch/powerpc/mm/tlb_64.c b/arch/powerpc/mm/tlb_64.c
index a01b5c608ff9..be7dd422c0fa 100644
--- a/arch/powerpc/mm/tlb_64.c
+++ b/arch/powerpc/mm/tlb_64.c
@@ -34,7 +34,7 @@
DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
/* This is declared as we are using the more or less generic
- * include/asm-powerpc/tlb.h file -- tgall
+ * arch/powerpc/include/asm/tlb.h file -- tgall
*/
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
@@ -147,7 +147,7 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
*/
if (huge) {
#ifdef CONFIG_HUGETLB_PAGE
- psize = mmu_huge_psize;
+ psize = get_slice_psize(mm, addr);;
#else
BUG();
psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */