Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |    7
-rw-r--r--  mm/Makefile          |    6
-rw-r--r--  mm/allocpercpu.c     |  129
-rw-r--r--  mm/bootmem.c         |  202
-rw-r--r--  mm/bounce.c          |  302
-rw-r--r--  mm/filemap.c         |  211
-rw-r--r--  mm/fremap.c          |    6
-rw-r--r--  mm/highmem.c         |  294
-rw-r--r--  mm/hugetlb.c         |   10
-rw-r--r--  mm/internal.h        |    4
-rw-r--r--  mm/memory.c          |  215
-rw-r--r--  mm/memory_hotplug.c  |   71
-rw-r--r--  mm/mempolicy.c       |   26
-rw-r--r--  mm/migrate.c         |    6
-rw-r--r--  mm/mmap.c            |   19
-rw-r--r--  mm/mprotect.c        |   53
-rw-r--r--  mm/mremap.c          |    2
-rw-r--r--  mm/msync.c           |  196
-rw-r--r--  mm/nommu.c           |  250
-rw-r--r--  mm/oom_kill.c        |  126
-rw-r--r--  mm/page-writeback.c  |  198
-rw-r--r--  mm/page_alloc.c      |  976
-rw-r--r--  mm/page_io.c         |   48
-rw-r--r--  mm/rmap.c            |   65
-rw-r--r--  mm/shmem.c           |  122
-rw-r--r--  mm/shmem_acl.c       |  197
-rw-r--r--  mm/slab.c            |  458
-rw-r--r--  mm/slob.c            |   52
-rw-r--r--  mm/swap.c            |   49
-rw-r--r--  mm/swapfile.c        |    7
-rw-r--r--  mm/truncate.c        |   85
-rw-r--r--  mm/util.c            |   18
-rw-r--r--  mm/vmalloc.c         |   38
-rw-r--r--  mm/vmscan.c          |  140
-rw-r--r--  mm/vmstat.c          |   52
35 files changed, 3199 insertions, 1441 deletions
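
The two largest additions below are mm/allocpercpu.c (the per-CPU allocator split out of mm/slab.c) and mm/bounce.c (block-layer bounce buffering split out of mm/highmem.c). For orientation, here is a minimal sketch of how a caller would consume the allocpercpu interface implemented further down. It assumes the percpu_alloc()/percpu_free() wrappers and the per_cpu_ptr() accessor from <linux/percpu.h> of this kernel generation; struct my_stats, its field, and the my_stats_* functions are purely illustrative, not part of the patch.

/*
 * Illustrative consumer of the allocpercpu API added in this commit.
 * percpu_alloc() populates one zeroed object per online CPU (see
 * __percpu_alloc_mask() below); percpu_free() depopulates them all.
 */
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/errno.h>

struct my_stats {			/* hypothetical per-CPU counters */
	unsigned long events;
};

static struct my_stats *stats;

static int my_stats_init(void)
{
	stats = percpu_alloc(sizeof(struct my_stats), GFP_KERNEL);
	return stats ? 0 : -ENOMEM;
}

static void my_stats_count(void)
{
	int cpu = get_cpu();		/* pin to this CPU while touching its copy */

	per_cpu_ptr(stats, cpu)->events++;
	put_cpu();
}

static void my_stats_exit(void)
{
	percpu_free(stats);		/* frees every populated per-CPU object */
}

per_cpu_ptr() on a dynamically allocated pointer goes through the percpu_data "disguise" handled in allocpercpu.c, so only CPUs that were actually populated may be dereferenced.
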
diff --git a/mm/Kconfig b/mm/Kconfig index 8f5b45615f7b..5d88489ef2de 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -115,12 +115,17 @@ config SPARSEMEM_EXTREME # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" - depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG + depends on SPARSEMEM || X86_64_ACPI_NUMA + depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG depends on (IA64 || X86 || PPC64) comment "Memory hotplug is currently incompatible with Software Suspend" depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND +config MEMORY_HOTPLUG_SPARSE + def_bool y + depends on SPARSEMEM && MEMORY_HOTPLUG + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. diff --git a/mm/Makefile b/mm/Makefile index 9dd824c11eeb..12b3a4eee88d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -12,15 +12,19 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o \ prio_tree.o util.o mmzone.o vmstat.o $(mmu-y) +ifeq ($(CONFIG_MMU)$(CONFIG_BLOCK),yy) +obj-y += bounce.o +endif obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SHMEM) += shmem.o +obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o - +obj-$(CONFIG_SMP) += allocpercpu.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c new file mode 100644 index 000000000000..eaa9abeea536 --- /dev/null +++ b/mm/allocpercpu.c @@ -0,0 +1,129 @@ +/* + * linux/mm/allocpercpu.c + * + * Separated from slab.c August 11, 2006 Christoph Lameter <clameter@sgi.com> + */ +#include <linux/mm.h> +#include <linux/module.h> + +/** + * percpu_depopulate - depopulate per-cpu data for given cpu + * @__pdata: per-cpu data to depopulate + * @cpu: depopulate per-cpu data for this cpu + * + * Depopulating per-cpu data for a cpu going offline would be a typical + * use case. You need to register a cpu hotplug handler for that purpose. + */ +void percpu_depopulate(void *__pdata, int cpu) +{ + struct percpu_data *pdata = __percpu_disguise(__pdata); + if (pdata->ptrs[cpu]) { + kfree(pdata->ptrs[cpu]); + pdata->ptrs[cpu] = NULL; + } +} +EXPORT_SYMBOL_GPL(percpu_depopulate); + +/** + * percpu_depopulate_mask - depopulate per-cpu data for some cpu's + * @__pdata: per-cpu data to depopulate + * @mask: depopulate per-cpu data for cpu's selected through mask bits + */ +void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) +{ + int cpu; + for_each_cpu_mask(cpu, *mask) + percpu_depopulate(__pdata, cpu); +} +EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); + +/** + * percpu_populate - populate per-cpu data for given cpu + * @__pdata: per-cpu data to populate further + * @size: size of per-cpu object + * @gfp: may sleep or not etc. + * @cpu: populate per-data for this cpu + * + * Populating per-cpu data for a cpu coming online would be a typical + * use case. You need to register a cpu hotplug handler for that purpose. + * Per-cpu object is populated with zeroed buffer. 
+ */ +void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) +{ + struct percpu_data *pdata = __percpu_disguise(__pdata); + int node = cpu_to_node(cpu); + + BUG_ON(pdata->ptrs[cpu]); + if (node_online(node)) { + /* FIXME: kzalloc_node(size, gfp, node) */ + pdata->ptrs[cpu] = kmalloc_node(size, gfp, node); + if (pdata->ptrs[cpu]) + memset(pdata->ptrs[cpu], 0, size); + } else + pdata->ptrs[cpu] = kzalloc(size, gfp); + return pdata->ptrs[cpu]; +} +EXPORT_SYMBOL_GPL(percpu_populate); + +/** + * percpu_populate_mask - populate per-cpu data for more cpu's + * @__pdata: per-cpu data to populate further + * @size: size of per-cpu object + * @gfp: may sleep or not etc. + * @mask: populate per-cpu data for cpu's selected through mask bits + * + * Per-cpu objects are populated with zeroed buffers. + */ +int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, + cpumask_t *mask) +{ + cpumask_t populated = CPU_MASK_NONE; + int cpu; + + for_each_cpu_mask(cpu, *mask) + if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { + __percpu_depopulate_mask(__pdata, &populated); + return -ENOMEM; + } else + cpu_set(cpu, populated); + return 0; +} +EXPORT_SYMBOL_GPL(__percpu_populate_mask); + +/** + * percpu_alloc_mask - initial setup of per-cpu data + * @size: size of per-cpu object + * @gfp: may sleep or not etc. + * @mask: populate per-data for cpu's selected through mask bits + * + * Populating per-cpu data for all online cpu's would be a typical use case, + * which is simplified by the percpu_alloc() wrapper. + * Per-cpu objects are populated with zeroed buffers. + */ +void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) +{ + void *pdata = kzalloc(sizeof(struct percpu_data), gfp); + void *__pdata = __percpu_disguise(pdata); + + if (unlikely(!pdata)) + return NULL; + if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask))) + return __pdata; + kfree(pdata); + return NULL; +} +EXPORT_SYMBOL_GPL(__percpu_alloc_mask); + +/** + * percpu_free - final cleanup of per-cpu data + * @__pdata: object to clean up + * + * We simply clean up any per-cpu object left. No need for the client to + * track and specify through a bis mask which per-cpu objects are to free. + */ +void percpu_free(void *__pdata) +{ + __percpu_depopulate_mask(__pdata, &cpu_possible_map); + kfree(__percpu_disguise(__pdata)); +} +EXPORT_SYMBOL_GPL(percpu_free); diff --git a/mm/bootmem.c b/mm/bootmem.c index 50353e0dac12..d53112fcb404 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -8,17 +8,15 @@ * free memory collector. It's used to deal with reserved * system memory and memory holes as well. 
*/ - -#include <linux/mm.h> -#include <linux/kernel_stat.h> -#include <linux/swap.h> -#include <linux/interrupt.h> #include <linux/init.h> +#include <linux/pfn.h> #include <linux/bootmem.h> -#include <linux/mmzone.h> #include <linux/module.h> -#include <asm/dma.h> + +#include <asm/bug.h> #include <asm/io.h> +#include <asm/processor.h> + #include "internal.h" /* @@ -41,7 +39,7 @@ unsigned long saved_max_pfn; #endif /* return the number of _pages_ that will be allocated for the boot bitmap */ -unsigned long __init bootmem_bootmap_pages (unsigned long pages) +unsigned long __init bootmem_bootmap_pages(unsigned long pages) { unsigned long mapsize; @@ -51,12 +49,14 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages) return mapsize; } + /* * link bdata in order */ -static void link_bootmem(bootmem_data_t *bdata) +static void __init link_bootmem(bootmem_data_t *bdata) { bootmem_data_t *ent; + if (list_empty(&bdata_list)) { list_add(&bdata->list, &bdata_list); return; @@ -69,22 +69,32 @@ static void link_bootmem(bootmem_data_t *bdata) } } list_add_tail(&bdata->list, &bdata_list); - return; } +/* + * Given an initialised bdata, it returns the size of the boot bitmap + */ +static unsigned long __init get_mapsize(bootmem_data_t *bdata) +{ + unsigned long mapsize; + unsigned long start = PFN_DOWN(bdata->node_boot_start); + unsigned long end = bdata->node_low_pfn; + + mapsize = ((end - start) + 7) / 8; + return ALIGN(mapsize, sizeof(long)); +} /* * Called once to set up the allocator itself. */ -static unsigned long __init init_bootmem_core (pg_data_t *pgdat, +static unsigned long __init init_bootmem_core(pg_data_t *pgdat, unsigned long mapstart, unsigned long start, unsigned long end) { bootmem_data_t *bdata = pgdat->bdata; - unsigned long mapsize = ((end - start)+7)/8; + unsigned long mapsize; - mapsize = ALIGN(mapsize, sizeof(long)); - bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); - bdata->node_boot_start = (start << PAGE_SHIFT); + bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); + bdata->node_boot_start = PFN_PHYS(start); bdata->node_low_pfn = end; link_bootmem(bdata); @@ -92,6 +102,7 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat, * Initially all pages are reserved - setup_arch() has to * register free RAM areas explicitly. */ + mapsize = get_mapsize(bdata); memset(bdata->node_bootmem_map, 0xff, mapsize); return mapsize; @@ -102,22 +113,22 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat, * might be used for boot-time allocations - or it might get added * to the free page pool later on. */ -static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) +static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr, + unsigned long size) { + unsigned long sidx, eidx; unsigned long i; + /* * round up, partially reserved pages are considered * fully reserved. 
*/ - unsigned long sidx = (addr - bdata->node_boot_start)/PAGE_SIZE; - unsigned long eidx = (addr + size - bdata->node_boot_start + - PAGE_SIZE-1)/PAGE_SIZE; - unsigned long end = (addr + size + PAGE_SIZE-1)/PAGE_SIZE; - BUG_ON(!size); - BUG_ON(sidx >= eidx); - BUG_ON((addr >> PAGE_SHIFT) >= bdata->node_low_pfn); - BUG_ON(end > bdata->node_low_pfn); + BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn); + BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn); + + sidx = PFN_DOWN(addr - bdata->node_boot_start); + eidx = PFN_UP(addr + size - bdata->node_boot_start); for (i = sidx; i < eidx; i++) if (test_and_set_bit(i, bdata->node_bootmem_map)) { @@ -127,20 +138,18 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long add } } -static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, unsigned long size) +static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, + unsigned long size) { + unsigned long sidx, eidx; unsigned long i; - unsigned long start; + /* * round down end of usable mem, partially free pages are * considered reserved. */ - unsigned long sidx; - unsigned long eidx = (addr + size - bdata->node_boot_start)/PAGE_SIZE; - unsigned long end = (addr + size)/PAGE_SIZE; - BUG_ON(!size); - BUG_ON(end > bdata->node_low_pfn); + BUG_ON(PFN_DOWN(addr + size) > bdata->node_low_pfn); if (addr < bdata->last_success) bdata->last_success = addr; @@ -148,8 +157,8 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, /* * Round up the beginning of the address. */ - start = (addr + PAGE_SIZE-1) / PAGE_SIZE; - sidx = start - (bdata->node_boot_start/PAGE_SIZE); + sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start); + eidx = PFN_DOWN(addr + size - bdata->node_boot_start); for (i = sidx; i < eidx; i++) { if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) @@ -175,10 +184,10 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { unsigned long offset, remaining_size, areasize, preferred; - unsigned long i, start = 0, incr, eidx, end_pfn = bdata->node_low_pfn; + unsigned long i, start = 0, incr, eidx, end_pfn; void *ret; - if(!size) { + if (!size) { printk("__alloc_bootmem_core(): zero-sized request\n"); BUG(); } @@ -187,23 +196,22 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, if (limit && bdata->node_boot_start >= limit) return NULL; - limit >>=PAGE_SHIFT; + end_pfn = bdata->node_low_pfn; + limit = PFN_DOWN(limit); if (limit && end_pfn > limit) end_pfn = limit; - eidx = end_pfn - (bdata->node_boot_start >> PAGE_SHIFT); + eidx = end_pfn - PFN_DOWN(bdata->node_boot_start); offset = 0; - if (align && - (bdata->node_boot_start & (align - 1UL)) != 0) - offset = (align - (bdata->node_boot_start & (align - 1UL))); - offset >>= PAGE_SHIFT; + if (align && (bdata->node_boot_start & (align - 1UL)) != 0) + offset = align - (bdata->node_boot_start & (align - 1UL)); + offset = PFN_DOWN(offset); /* * We try to allocate bootmem pages above 'goal' * first, then we try to allocate lower pages. 
*/ - if (goal && (goal >= bdata->node_boot_start) && - ((goal >> PAGE_SHIFT) < end_pfn)) { + if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) { preferred = goal - bdata->node_boot_start; if (bdata->last_success >= preferred) @@ -212,9 +220,8 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, } else preferred = 0; - preferred = ALIGN(preferred, align) >> PAGE_SHIFT; - preferred += offset; - areasize = (size+PAGE_SIZE-1)/PAGE_SIZE; + preferred = PFN_DOWN(ALIGN(preferred, align)) + offset; + areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; incr = align >> PAGE_SHIFT ? : 1; restart_scan: @@ -229,7 +236,7 @@ restart_scan: for (j = i + 1; j < i + areasize; ++j) { if (j >= eidx) goto fail_block; - if (test_bit (j, bdata->node_bootmem_map)) + if (test_bit(j, bdata->node_bootmem_map)) goto fail_block; } start = i; @@ -245,7 +252,7 @@ restart_scan: return NULL; found: - bdata->last_success = start << PAGE_SHIFT; + bdata->last_success = PFN_PHYS(start); BUG_ON(start >= eidx); /* @@ -257,19 +264,21 @@ found: bdata->last_offset && bdata->last_pos+1 == start) { offset = ALIGN(bdata->last_offset, align); BUG_ON(offset > PAGE_SIZE); - remaining_size = PAGE_SIZE-offset; + remaining_size = PAGE_SIZE - offset; if (size < remaining_size) { areasize = 0; /* last_pos unchanged */ - bdata->last_offset = offset+size; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + - bdata->node_boot_start); + bdata->last_offset = offset + size; + ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + + offset + + bdata->node_boot_start); } else { remaining_size = size - remaining_size; - areasize = (remaining_size+PAGE_SIZE-1)/PAGE_SIZE; - ret = phys_to_virt(bdata->last_pos*PAGE_SIZE + offset + - bdata->node_boot_start); - bdata->last_pos = start+areasize-1; + areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE; + ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + + offset + + bdata->node_boot_start); + bdata->last_pos = start + areasize - 1; bdata->last_offset = remaining_size; } bdata->last_offset &= ~PAGE_MASK; @@ -282,7 +291,7 @@ found: /* * Reserve the area now: */ - for (i = start; i < start+areasize; i++) + for (i = start; i < start + areasize; i++) if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map))) BUG(); memset(ret, 0, size); @@ -303,8 +312,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) count = 0; /* first extant page of the node */ - pfn = bdata->node_boot_start >> PAGE_SHIFT; - idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT); + pfn = PFN_DOWN(bdata->node_boot_start); + idx = bdata->node_low_pfn - pfn; map = bdata->node_bootmem_map; /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ if (bdata->node_boot_start == 0 || @@ -333,7 +342,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) } } } else { - i+=BITS_PER_LONG; + i += BITS_PER_LONG; } pfn += BITS_PER_LONG; } @@ -345,9 +354,10 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) */ page = virt_to_page(bdata->node_bootmem_map); count = 0; - for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { - count++; + idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT; + for (i = 0; i < idx; i++, page++) { __free_pages_bootmem(page, 0); + count++; } total += count; bdata->node_bootmem_map = NULL; @@ -355,64 +365,72 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) return total; } -unsigned long __init init_bootmem_node (pg_data_t *pgdat, unsigned 
long freepfn, unsigned long startpfn, unsigned long endpfn) +unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, + unsigned long startpfn, unsigned long endpfn) { - return(init_bootmem_core(pgdat, freepfn, startpfn, endpfn)); + return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); } -void __init reserve_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) +void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) { reserve_bootmem_core(pgdat->bdata, physaddr, size); } -void __init free_bootmem_node (pg_data_t *pgdat, unsigned long physaddr, unsigned long size) +void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) { free_bootmem_core(pgdat->bdata, physaddr, size); } -unsigned long __init free_all_bootmem_node (pg_data_t *pgdat) +unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { - return(free_all_bootmem_core(pgdat)); + return free_all_bootmem_core(pgdat); } -unsigned long __init init_bootmem (unsigned long start, unsigned long pages) +unsigned long __init init_bootmem(unsigned long start, unsigned long pages) { max_low_pfn = pages; min_low_pfn = start; - return(init_bootmem_core(NODE_DATA(0), start, 0, pages)); + return init_bootmem_core(NODE_DATA(0), start, 0, pages); } #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE -void __init reserve_bootmem (unsigned long addr, unsigned long size) +void __init reserve_bootmem(unsigned long addr, unsigned long size) { reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size); } #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ -void __init free_bootmem (unsigned long addr, unsigned long size) +void __init free_bootmem(unsigned long addr, unsigned long size) { free_bootmem_core(NODE_DATA(0)->bdata, addr, size); } -unsigned long __init free_all_bootmem (void) +unsigned long __init free_all_bootmem(void) { - return(free_all_bootmem_core(NODE_DATA(0))); + return free_all_bootmem_core(NODE_DATA(0)); } -void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) +void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, + unsigned long goal) { bootmem_data_t *bdata; void *ptr; - list_for_each_entry(bdata, &bdata_list, list) - if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0))) - return(ptr); + list_for_each_entry(bdata, &bdata_list, list) { + ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); + if (ptr) + return ptr; + } return NULL; } -void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) +void * __init __alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal) { void *mem = __alloc_bootmem_nopanic(size,align,goal); + if (mem) return mem; /* @@ -424,29 +442,34 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned } -void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, - unsigned long goal) +void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) { void *ptr; ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); if (ptr) - return (ptr); + return ptr; return __alloc_bootmem(size, align, goal); } -#define LOW32LIMIT 0xffffffff +#ifndef ARCH_LOW_ADDRESS_LIMIT +#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL +#endif -void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) +void * __init __alloc_bootmem_low(unsigned long size, 
unsigned long align, + unsigned long goal) { bootmem_data_t *bdata; void *ptr; - list_for_each_entry(bdata, &bdata_list, list) - if ((ptr = __alloc_bootmem_core(bdata, size, - align, goal, LOW32LIMIT))) - return(ptr); + list_for_each_entry(bdata, &bdata_list, list) { + ptr = __alloc_bootmem_core(bdata, size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); + if (ptr) + return ptr; + } /* * Whoops, we cannot satisfy the allocation request. @@ -459,5 +482,6 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsig void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT); + return __alloc_bootmem_core(pgdat->bdata, size, align, goal, + ARCH_LOW_ADDRESS_LIMIT); } diff --git a/mm/bounce.c b/mm/bounce.c new file mode 100644 index 000000000000..e4b62d2a4024 --- /dev/null +++ b/mm/bounce.c @@ -0,0 +1,302 @@ +/* bounce buffer handling for block devices + * + * - Split from highmem.c + */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/bio.h> +#include <linux/pagemap.h> +#include <linux/mempool.h> +#include <linux/blkdev.h> +#include <linux/init.h> +#include <linux/hash.h> +#include <linux/highmem.h> +#include <linux/blktrace_api.h> +#include <asm/tlbflush.h> + +#define POOL_SIZE 64 +#define ISA_POOL_SIZE 16 + +static mempool_t *page_pool, *isa_page_pool; + +#ifdef CONFIG_HIGHMEM +static __init int init_emergency_pool(void) +{ + struct sysinfo i; + si_meminfo(&i); + si_swapinfo(&i); + + if (!i.totalhigh) + return 0; + + page_pool = mempool_create_page_pool(POOL_SIZE, 0); + BUG_ON(!page_pool); + printk("highmem bounce pool size: %d pages\n", POOL_SIZE); + + return 0; +} + +__initcall(init_emergency_pool); + +/* + * highmem version, map in to vec + */ +static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) +{ + unsigned long flags; + unsigned char *vto; + + local_irq_save(flags); + vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); + memcpy(vto + to->bv_offset, vfrom, to->bv_len); + kunmap_atomic(vto, KM_BOUNCE_READ); + local_irq_restore(flags); +} + +#else /* CONFIG_HIGHMEM */ + +#define bounce_copy_vec(to, vfrom) \ + memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) + +#endif /* CONFIG_HIGHMEM */ + +/* + * allocate pages in the DMA region for the ISA pool + */ +static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) +{ + return mempool_alloc_pages(gfp_mask | GFP_DMA, data); +} + +/* + * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA + * as the max address, so check if the pool has already been created. + */ +int init_emergency_isa_pool(void) +{ + if (isa_page_pool) + return 0; + + isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, + mempool_free_pages, (void *) 0); + BUG_ON(!isa_page_pool); + + printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); + return 0; +} + +/* + * Simple bounce buffer support for highmem pages. Depending on the + * queue gfp mask set, *to may or may not be a highmem page. 
kmap it + * always, it will do the Right Thing + */ +static void copy_to_high_bio_irq(struct bio *to, struct bio *from) +{ + unsigned char *vfrom; + struct bio_vec *tovec, *fromvec; + int i; + + __bio_for_each_segment(tovec, to, i, 0) { + fromvec = from->bi_io_vec + i; + + /* + * not bounced + */ + if (tovec->bv_page == fromvec->bv_page) + continue; + + /* + * fromvec->bv_offset and fromvec->bv_len might have been + * modified by the block layer, so use the original copy, + * bounce_copy_vec already uses tovec->bv_len + */ + vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; + + flush_dcache_page(tovec->bv_page); + bounce_copy_vec(tovec, vfrom); + } +} + +static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) +{ + struct bio *bio_orig = bio->bi_private; + struct bio_vec *bvec, *org_vec; + int i; + + if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) + set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); + + /* + * free up bounce indirect pages used + */ + __bio_for_each_segment(bvec, bio, i, 0) { + org_vec = bio_orig->bi_io_vec + i; + if (bvec->bv_page == org_vec->bv_page) + continue; + + dec_zone_page_state(bvec->bv_page, NR_BOUNCE); + mempool_free(bvec->bv_page, pool); + } + + bio_endio(bio_orig, bio_orig->bi_size, err); + bio_put(bio); +} + +static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err) +{ + if (bio->bi_size) + return 1; + + bounce_end_io(bio, page_pool, err); + return 0; +} + +static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err) +{ + if (bio->bi_size) + return 1; + + bounce_end_io(bio, isa_page_pool, err); + return 0; +} + +static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) +{ + struct bio *bio_orig = bio->bi_private; + + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + copy_to_high_bio_irq(bio_orig, bio); + + bounce_end_io(bio, pool, err); +} + +static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err) +{ + if (bio->bi_size) + return 1; + + __bounce_end_io_read(bio, page_pool, err); + return 0; +} + +static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err) +{ + if (bio->bi_size) + return 1; + + __bounce_end_io_read(bio, isa_page_pool, err); + return 0; +} + +static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, + mempool_t *pool) +{ + struct page *page; + struct bio *bio = NULL; + int i, rw = bio_data_dir(*bio_orig); + struct bio_vec *to, *from; + + bio_for_each_segment(from, *bio_orig, i) { + page = from->bv_page; + + /* + * is destination page below bounce pfn? 
+ */ + if (page_to_pfn(page) < q->bounce_pfn) + continue; + + /* + * irk, bounce it + */ + if (!bio) + bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt); + + to = bio->bi_io_vec + i; + + to->bv_page = mempool_alloc(pool, q->bounce_gfp); + to->bv_len = from->bv_len; + to->bv_offset = from->bv_offset; + inc_zone_page_state(to->bv_page, NR_BOUNCE); + + if (rw == WRITE) { + char *vto, *vfrom; + + flush_dcache_page(from->bv_page); + vto = page_address(to->bv_page) + to->bv_offset; + vfrom = kmap(from->bv_page) + from->bv_offset; + memcpy(vto, vfrom, to->bv_len); + kunmap(from->bv_page); + } + } + + /* + * no pages bounced + */ + if (!bio) + return; + + /* + * at least one page was bounced, fill in possible non-highmem + * pages + */ + __bio_for_each_segment(from, *bio_orig, i, 0) { + to = bio_iovec_idx(bio, i); + if (!to->bv_page) { + to->bv_page = from->bv_page; + to->bv_len = from->bv_len; + to->bv_offset = from->bv_offset; + } + } + + bio->bi_bdev = (*bio_orig)->bi_bdev; + bio->bi_flags |= (1 << BIO_BOUNCED); + bio->bi_sector = (*bio_orig)->bi_sector; + bio->bi_rw = (*bio_orig)->bi_rw; + + bio->bi_vcnt = (*bio_orig)->bi_vcnt; + bio->bi_idx = (*bio_orig)->bi_idx; + bio->bi_size = (*bio_orig)->bi_size; + + if (pool == page_pool) { + bio->bi_end_io = bounce_end_io_write; + if (rw == READ) + bio->bi_end_io = bounce_end_io_read; + } else { + bio->bi_end_io = bounce_end_io_write_isa; + if (rw == READ) + bio->bi_end_io = bounce_end_io_read_isa; + } + + bio->bi_private = *bio_orig; + *bio_orig = bio; +} + +void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) +{ + mempool_t *pool; + + /* + * for non-isa bounce case, just check if the bounce pfn is equal + * to or bigger than the highest pfn in the system -- in that case, + * don't waste time iterating over bio segments + */ + if (!(q->bounce_gfp & GFP_DMA)) { + if (q->bounce_pfn >= blk_max_pfn) + return; + pool = page_pool; + } else { + BUG_ON(!isa_page_pool); + pool = isa_page_pool; + } + + blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); + + /* + * slow path + */ + __blk_queue_bounce(q, bio_orig, pool); +} + +EXPORT_SYMBOL(blk_queue_bounce); diff --git a/mm/filemap.c b/mm/filemap.c index b9a60c43b61a..ec469235985d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -488,6 +488,12 @@ struct page *page_cache_alloc_cold(struct address_space *x) EXPORT_SYMBOL(page_cache_alloc_cold); #endif +static int __sleep_on_page_lock(void *word) +{ + io_schedule(); + return 0; +} + /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of @@ -577,13 +583,24 @@ void fastcall __lock_page(struct page *page) } EXPORT_SYMBOL(__lock_page); +/* + * Variant of lock_page that does not require the caller to hold a reference + * on the page's mapping. + */ +void fastcall __lock_page_nosync(struct page *page) +{ + DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, + TASK_UNINTERRUPTIBLE); +} + /** * find_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * - * A rather lightweight function, finding and getting a reference to a - * hashed page atomically. + * Is there a pagecache struct page at the given (mapping, offset) tuple? + * If yes, increment its refcount and return it; if no, return NULL. */ struct page * find_get_page(struct address_space *mapping, unsigned long offset) { @@ -970,7 +987,7 @@ page_not_up_to_date: /* Get exclusive access to the page ... 
*/ lock_page(page); - /* Did it get unhashed before we got the lock? */ + /* Did it get truncated before we got the lock? */ if (!page->mapping) { unlock_page(page); page_cache_release(page); @@ -1132,13 +1149,14 @@ success: * that can use the page cache directly. */ ssize_t -__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct file *filp = iocb->ki_filp; ssize_t retval; unsigned long seg; size_t count; + loff_t *ppos = &iocb->ki_pos; count = 0; for (seg = 0; seg < nr_segs; seg++) { @@ -1162,7 +1180,7 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { - loff_t pos = *ppos, size; + loff_t size; struct address_space *mapping; struct inode *inode; @@ -1206,33 +1224,8 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, out: return retval; } -EXPORT_SYMBOL(__generic_file_aio_read); - -ssize_t -generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos) -{ - struct iovec local_iov = { .iov_base = buf, .iov_len = count }; - - BUG_ON(iocb->ki_pos != pos); - return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); -} EXPORT_SYMBOL(generic_file_aio_read); -ssize_t -generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) -{ - struct iovec local_iov = { .iov_base = buf, .iov_len = count }; - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, filp); - ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - return ret; -} -EXPORT_SYMBOL(generic_file_read); - int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) { ssize_t written; @@ -1454,7 +1447,7 @@ outside_data_content: * accessible.. */ if (area->vm_mm == current->mm) - return NULL; + return NOPAGE_SIGBUS; /* Fall through to the non-read-ahead case */ no_cached_page: /* @@ -1479,7 +1472,7 @@ no_cached_page: */ if (error == -ENOMEM) return NOPAGE_OOM; - return NULL; + return NOPAGE_SIGBUS; page_not_uptodate: if (!did_readaround) { @@ -1548,7 +1541,7 @@ page_not_uptodate: */ shrink_readahead_size_eio(file, ra); page_cache_release(page); - return NULL; + return NOPAGE_SIGBUS; } EXPORT_SYMBOL(filemap_nopage); @@ -1610,7 +1603,7 @@ no_cached_page: page_not_uptodate: lock_page(page); - /* Did it get unhashed while we waited for it? */ + /* Did it get truncated while we waited for it? */ if (!page->mapping) { unlock_page(page); goto err; @@ -2003,6 +1996,7 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) *count = inode->i_sb->s_maxbytes - *pos; } else { +#ifdef CONFIG_BLOCK loff_t isize; if (bdev_read_only(I_BDEV(inode))) return -EPERM; @@ -2014,6 +2008,9 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i if (*pos + *count > isize) *count = isize - *pos; +#else + return -EPERM; +#endif } return 0; } @@ -2294,22 +2291,22 @@ out: current->backing_dev_info = NULL; return written ? 
written : err; } -EXPORT_SYMBOL(generic_file_aio_write_nolock); -ssize_t -generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, + const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; - loff_t pos = *ppos; - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, ppos); + BUG_ON(iocb->ki_pos != pos); + + ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, + &iocb->ki_pos); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; + ssize_t err; err = sync_page_range_nolock(inode, mapping, pos, ret); if (err < 0) @@ -2317,51 +2314,21 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, } return ret; } +EXPORT_SYMBOL(generic_file_aio_write_nolock); -static ssize_t -__generic_file_write_nolock(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) -{ - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, file); - ret = __generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); - return ret; -} - -ssize_t -generic_file_write_nolock(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) -{ - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, file); - ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - return ret; -} -EXPORT_SYMBOL(generic_file_write_nolock); - -ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, - size_t count, loff_t pos) +ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t ret; - struct iovec local_iov = { .iov_base = (void __user *)buf, - .iov_len = count }; BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex); - ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, - &iocb->ki_pos); + ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, + &iocb->ki_pos); mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { @@ -2375,66 +2342,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, } EXPORT_SYMBOL(generic_file_aio_write); -ssize_t generic_file_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - struct iovec local_iov = { .iov_base = (void __user *)buf, - .iov_len = count }; - - mutex_lock(&inode->i_mutex); - ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); - mutex_unlock(&inode->i_mutex); - - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - ssize_t err; - - err = sync_page_range(inode, mapping, *ppos - ret, ret); - if (err < 0) - ret = err; - } - return ret; -} -EXPORT_SYMBOL(generic_file_write); - -ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) -{ - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, filp); - ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - 
return ret; -} -EXPORT_SYMBOL(generic_file_readv); - -ssize_t generic_file_writev(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) -{ - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - - mutex_lock(&inode->i_mutex); - ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); - mutex_unlock(&inode->i_mutex); - - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; - - err = sync_page_range(inode, mapping, *ppos - ret, ret); - if (err < 0) - ret = err; - } - return ret; -} -EXPORT_SYMBOL(generic_file_writev); - /* * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something * went wrong during pagecache shootdown. @@ -2474,3 +2381,33 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, } return retval; } + +/** + * try_to_release_page() - release old fs-specific metadata on a page + * + * @page: the page which the kernel is trying to free + * @gfp_mask: memory allocation flags (and I/O mode) + * + * The address_space is to try to release any data against the page + * (presumably at page->private). If the release was successful, return `1'. + * Otherwise return zero. + * + * The @gfp_mask argument specifies whether I/O may be performed to release + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). + * + * NOTE: @gfp_mask may go away, and this function may become non-blocking. + */ +int try_to_release_page(struct page *page, gfp_t gfp_mask) +{ + struct address_space * const mapping = page->mapping; + + BUG_ON(!PageLocked(page)); + if (PageWriteback(page)) + return 0; + + if (mapping && mapping->a_ops->releasepage) + return mapping->a_ops->releasepage(page, gfp_mask); + return try_to_free_buffers(page); +} + +EXPORT_SYMBOL(try_to_release_page); diff --git a/mm/fremap.c b/mm/fremap.c index 21b7d0cbc98c..7a9d0f5d246d 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -39,7 +39,7 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(mm, addr, ptep); + pte_clear_not_present_full(mm, addr, ptep, 0); } return !!page; } @@ -79,9 +79,9 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter(mm, file_rss); flush_icache_page(vma, page); - set_pte_at(mm, addr, pte, mk_pte(page, prot)); + pte_val = mk_pte(page, prot); + set_pte_at(mm, addr, pte, pte_val); page_add_file_rmap(page); - pte_val = *pte; update_mmu_cache(vma, addr, pte_val); lazy_mmu_prot_update(pte_val); err = 0; diff --git a/mm/highmem.c b/mm/highmem.c index 9b2a5403c447..0206e7e5018c 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -29,13 +29,6 @@ #include <linux/blktrace_api.h> #include <asm/tlbflush.h> -static mempool_t *page_pool, *isa_page_pool; - -static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) -{ - return mempool_alloc_pages(gfp_mask | GFP_DMA, data); -} - /* * Virtual_count is not a pure "count". 
* 0 means that it is not mapped, and has not been mapped @@ -46,6 +39,19 @@ static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) */ #ifdef CONFIG_HIGHMEM +unsigned long totalhigh_pages __read_mostly; + +unsigned int nr_free_highpages (void) +{ + pg_data_t *pgdat; + unsigned int pages = 0; + + for_each_online_pgdat(pgdat) + pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; + + return pages; +} + static int pkmap_count[LAST_PKMAP]; static unsigned int last_pkmap_nr; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); @@ -204,282 +210,8 @@ void fastcall kunmap_high(struct page *page) } EXPORT_SYMBOL(kunmap_high); - -#define POOL_SIZE 64 - -static __init int init_emergency_pool(void) -{ - struct sysinfo i; - si_meminfo(&i); - si_swapinfo(&i); - - if (!i.totalhigh) - return 0; - - page_pool = mempool_create_page_pool(POOL_SIZE, 0); - BUG_ON(!page_pool); - printk("highmem bounce pool size: %d pages\n", POOL_SIZE); - - return 0; -} - -__initcall(init_emergency_pool); - -/* - * highmem version, map in to vec - */ -static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) -{ - unsigned long flags; - unsigned char *vto; - - local_irq_save(flags); - vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); - memcpy(vto + to->bv_offset, vfrom, to->bv_len); - kunmap_atomic(vto, KM_BOUNCE_READ); - local_irq_restore(flags); -} - -#else /* CONFIG_HIGHMEM */ - -#define bounce_copy_vec(to, vfrom) \ - memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) - #endif -#define ISA_POOL_SIZE 16 - -/* - * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA - * as the max address, so check if the pool has already been created. - */ -int init_emergency_isa_pool(void) -{ - if (isa_page_pool) - return 0; - - isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, - mempool_free_pages, (void *) 0); - BUG_ON(!isa_page_pool); - - printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); - return 0; -} - -/* - * Simple bounce buffer support for highmem pages. Depending on the - * queue gfp mask set, *to may or may not be a highmem page. 
kmap it - * always, it will do the Right Thing - */ -static void copy_to_high_bio_irq(struct bio *to, struct bio *from) -{ - unsigned char *vfrom; - struct bio_vec *tovec, *fromvec; - int i; - - __bio_for_each_segment(tovec, to, i, 0) { - fromvec = from->bi_io_vec + i; - - /* - * not bounced - */ - if (tovec->bv_page == fromvec->bv_page) - continue; - - /* - * fromvec->bv_offset and fromvec->bv_len might have been - * modified by the block layer, so use the original copy, - * bounce_copy_vec already uses tovec->bv_len - */ - vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; - - flush_dcache_page(tovec->bv_page); - bounce_copy_vec(tovec, vfrom); - } -} - -static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) -{ - struct bio *bio_orig = bio->bi_private; - struct bio_vec *bvec, *org_vec; - int i; - - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); - - /* - * free up bounce indirect pages used - */ - __bio_for_each_segment(bvec, bio, i, 0) { - org_vec = bio_orig->bi_io_vec + i; - if (bvec->bv_page == org_vec->bv_page) - continue; - - dec_zone_page_state(bvec->bv_page, NR_BOUNCE); - mempool_free(bvec->bv_page, pool); - } - - bio_endio(bio_orig, bio_orig->bi_size, err); - bio_put(bio); -} - -static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err) -{ - if (bio->bi_size) - return 1; - - bounce_end_io(bio, page_pool, err); - return 0; -} - -static int bounce_end_io_write_isa(struct bio *bio, unsigned int bytes_done, int err) -{ - if (bio->bi_size) - return 1; - - bounce_end_io(bio, isa_page_pool, err); - return 0; -} - -static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) -{ - struct bio *bio_orig = bio->bi_private; - - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) - copy_to_high_bio_irq(bio_orig, bio); - - bounce_end_io(bio, pool, err); -} - -static int bounce_end_io_read(struct bio *bio, unsigned int bytes_done, int err) -{ - if (bio->bi_size) - return 1; - - __bounce_end_io_read(bio, page_pool, err); - return 0; -} - -static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int err) -{ - if (bio->bi_size) - return 1; - - __bounce_end_io_read(bio, isa_page_pool, err); - return 0; -} - -static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, - mempool_t *pool) -{ - struct page *page; - struct bio *bio = NULL; - int i, rw = bio_data_dir(*bio_orig); - struct bio_vec *to, *from; - - bio_for_each_segment(from, *bio_orig, i) { - page = from->bv_page; - - /* - * is destination page below bounce pfn? 
- */ - if (page_to_pfn(page) < q->bounce_pfn) - continue; - - /* - * irk, bounce it - */ - if (!bio) - bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt); - - to = bio->bi_io_vec + i; - - to->bv_page = mempool_alloc(pool, q->bounce_gfp); - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; - inc_zone_page_state(to->bv_page, NR_BOUNCE); - - if (rw == WRITE) { - char *vto, *vfrom; - - flush_dcache_page(from->bv_page); - vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap(from->bv_page) + from->bv_offset; - memcpy(vto, vfrom, to->bv_len); - kunmap(from->bv_page); - } - } - - /* - * no pages bounced - */ - if (!bio) - return; - - /* - * at least one page was bounced, fill in possible non-highmem - * pages - */ - __bio_for_each_segment(from, *bio_orig, i, 0) { - to = bio_iovec_idx(bio, i); - if (!to->bv_page) { - to->bv_page = from->bv_page; - to->bv_len = from->bv_len; - to->bv_offset = from->bv_offset; - } - } - - bio->bi_bdev = (*bio_orig)->bi_bdev; - bio->bi_flags |= (1 << BIO_BOUNCED); - bio->bi_sector = (*bio_orig)->bi_sector; - bio->bi_rw = (*bio_orig)->bi_rw; - - bio->bi_vcnt = (*bio_orig)->bi_vcnt; - bio->bi_idx = (*bio_orig)->bi_idx; - bio->bi_size = (*bio_orig)->bi_size; - - if (pool == page_pool) { - bio->bi_end_io = bounce_end_io_write; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read; - } else { - bio->bi_end_io = bounce_end_io_write_isa; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read_isa; - } - - bio->bi_private = *bio_orig; - *bio_orig = bio; -} - -void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) -{ - mempool_t *pool; - - /* - * for non-isa bounce case, just check if the bounce pfn is equal - * to or bigger than the highest pfn in the system -- in that case, - * don't waste time iterating over bio segments - */ - if (!(q->bounce_gfp & GFP_DMA)) { - if (q->bounce_pfn >= blk_max_pfn) - return; - pool = page_pool; - } else { - BUG_ON(!isa_page_pool); - pool = isa_page_pool; - } - - blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); - - /* - * slow path - */ - __blk_queue_bounce(q, bio_orig, pool); -} - -EXPORT_SYMBOL(blk_queue_bounce); - #if defined(HASHED_PAGE_VIRTUAL) #define PA_HASH_ORDER 7 diff --git a/mm/hugetlb.c b/mm/hugetlb.c index df499973255f..7c7d03dbf73d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -72,7 +72,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, struct zone **z; for (z = zonelist->zones; *z; z++) { - nid = (*z)->zone_pgdat->node_id; + nid = zone_to_nid(*z); if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && !list_empty(&hugepage_freelists[nid])) break; @@ -177,7 +177,7 @@ static void update_and_free_page(struct page *page) { int i; nr_huge_pages--; - nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--; + nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | @@ -191,7 +191,8 @@ static void update_and_free_page(struct page *page) #ifdef CONFIG_HIGHMEM static void try_to_free_low(unsigned long count) { - int i, nid; + int i; + for (i = 0; i < MAX_NUMNODES; ++i) { struct page *page, *next; list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { @@ -199,9 +200,8 @@ static void try_to_free_low(unsigned long count) continue; list_del(&page->lru); update_and_free_page(page); - nid = page_zone(page)->zone_pgdat->node_id; free_huge_pages--; - free_huge_pages_node[nid]--; + free_huge_pages_node[page_to_nid(page)]--; if (count 
>= nr_huge_pages) return; } diff --git a/mm/internal.h b/mm/internal.h index d20e3cc4aef0..d527b80b292f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,8 +24,8 @@ static inline void set_page_count(struct page *page, int v) */ static inline void set_page_refcounted(struct page *page) { - BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); - BUG_ON(atomic_read(&page->_count)); + VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); + VM_BUG_ON(atomic_read(&page->_count)); set_page_count(page, 1); } diff --git a/mm/memory.c b/mm/memory.c index 109e9866237e..9cf3f341a28a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -49,6 +49,7 @@ #include <linux/module.h> #include <linux/delayacct.h> #include <linux/init.h> +#include <linux/writeback.h> #include <asm/pgalloc.h> #include <asm/uaccess.h> @@ -466,7 +467,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ if (is_cow_mapping(vm_flags)) { ptep_set_wrprotect(src_mm, addr, src_pte); - pte = *src_pte; + pte = pte_wrprotect(pte); } /* @@ -505,6 +506,7 @@ again: src_pte = pte_offset_map_nested(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); do { /* @@ -526,6 +528,7 @@ again: progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap_nested(src_pte - 1); add_mm_rss(dst_mm, rss[0], rss[1]); @@ -627,6 +630,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, int anon_rss = 0; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; if (pte_none(ptent)) { @@ -689,10 +693,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; if (!pte_file(ptent)) free_swap_and_cache(pte_to_swp_entry(ptent)); - pte_clear_full(mm, addr, pte, tlb->fullmm); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); add_mm_rss(mm, file_rss, anon_rss); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return addr; @@ -1108,6 +1113,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; + arch_enter_lazy_mmu_mode(); do { struct page *page = ZERO_PAGE(addr); pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); @@ -1117,6 +1123,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, zero_pte); } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return 0; } @@ -1226,7 +1233,12 @@ out: return retval; } -/* +/** + * vm_insert_page - insert single page into user vma + * @vma: user vma to map to + * @addr: target user address of this page + * @page: source kernel page + * * This allows drivers to insert individual pages they've allocated * into a user vma. 
* @@ -1269,11 +1281,13 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; + arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); return 0; } @@ -1318,7 +1332,16 @@ static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, return 0; } -/* Note: this is only safe if the mm semaphore is held when called. */ +/** + * remap_pfn_range - remap kernel memory to userspace + * @vma: user vma to map to + * @addr: target user address to start at + * @pfn: physical address of kernel memory + * @size: size of map area + * @prot: page protection flags for this mapping + * + * Note: this is only safe if the mm semaphore is held when called. + */ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { @@ -1458,14 +1481,29 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *old_page, *new_page; pte_t entry; - int reuse, ret = VM_FAULT_MINOR; + int reuse = 0, ret = VM_FAULT_MINOR; + struct page *dirty_page = NULL; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) goto gotten; - if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) == - (VM_SHARED|VM_WRITE))) { + /* + * Take out anonymous pages first, anonymous shared vmas are + * not dirty accountable. + */ + if (PageAnon(old_page)) { + if (!TestSetPageLocked(old_page)) { + reuse = can_share_swap_page(old_page); + unlock_page(old_page); + } + } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED))) { + /* + * Only catch write-faults on shared writable pages, + * read-only shared pages can get COWed by + * get_user_pages(.write=1, .force=1). + */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { /* * Notify the address space that the page is about to @@ -1494,13 +1532,9 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_same(*page_table, orig_pte)) goto unlock; } - + dirty_page = old_page; + get_page(dirty_page); reuse = 1; - } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { - reuse = can_share_swap_page(old_page); - unlock_page(old_page); - } else { - reuse = 0; } if (reuse) { @@ -1551,7 +1585,14 @@ gotten: entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); lazy_mmu_prot_update(entry); - ptep_establish(vma, address, page_table, entry); + /* + * Clear the pte entry and flush it first, before updating the + * pte with the new entry. This will avoid a race condition + * seen in the presence of one thread doing SMC and another + * thread doing COW. + */ + ptep_clear_flush(vma, address, page_table); + set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); @@ -1566,6 +1607,10 @@ gotten: page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); + if (dirty_page) { + set_page_dirty_balance(dirty_page); + put_page(dirty_page); + } return ret; oom: if (old_page) @@ -1785,9 +1830,10 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/* - * Handle all mappings that got truncated by a "truncate()" - * system call. 
+/** + * vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating * * NOTE! We have to be ready to update the memory sharing * between the file and the memory map for a potential last @@ -1856,11 +1902,16 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) } EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */ -/* +/** + * swapin_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory + * @addr: address to start + * @vma: user vma this addresses belong to + * * Primitive swap readahead code. We simply read an aligned block of * (1 << page_cluster) entries in the swap area. This method is chosen * because it doesn't cost us any seek time. We also make sure to queue - * the 'original' request together with the readahead ones... + * the 'original' request together with the readahead ones... * * This has been extended to use the NUMA policies from the mm triggering * the readahead. @@ -2098,6 +2149,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned int sequence = 0; int ret = VM_FAULT_MINOR; int anon = 0; + struct page *dirty_page = NULL; pte_unmap(page_table); BUG_ON(vma->vm_flags & VM_PFNMAP); @@ -2192,6 +2244,10 @@ retry: } else { inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); + if (write_access) { + dirty_page = new_page; + get_page(dirty_page); + } } } else { /* One of our sibling threads was faster, back out. */ @@ -2204,6 +2260,10 @@ retry: lazy_mmu_prot_update(entry); unlock: pte_unmap_unlock(page_table, ptl); + if (dirty_page) { + set_page_dirty_balance(dirty_page); + put_page(dirty_page); + } return ret; oom: page_cache_release(new_page); @@ -2211,6 +2271,54 @@ oom: } /* + * do_no_pfn() tries to create a new page mapping for a page without + * a struct_page backing it + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + * + * It is expected that the ->nopfn handler always returns the same pfn + * for a given virtual mapping. + * + * Mark this `noinline' to prevent it from bloating the main pagefault code. + */ +static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access) +{ + spinlock_t *ptl; + pte_t entry; + unsigned long pfn; + int ret = VM_FAULT_MINOR; + + pte_unmap(page_table); + BUG_ON(!(vma->vm_flags & VM_PFNMAP)); + BUG_ON(is_cow_mapping(vma->vm_flags)); + + pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); + if (pfn == NOPFN_OOM) + return VM_FAULT_OOM; + if (pfn == NOPFN_SIGBUS) + return VM_FAULT_SIGBUS; + + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + + /* Only go through if we didn't race with anybody else... */ + if (pte_none(*page_table)) { + entry = pfn_pte(pfn, vma->vm_page_prot); + if (write_access) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + set_pte_at(mm, address, page_table, entry); + } + pte_unmap_unlock(page_table, ptl); + return ret; +} + +/* * Fault of a previously existing named mapping. Repopulate the pte * from the encoded file_pte if possible. This enables swappable * nonlinear vmas. 
@@ -2272,11 +2380,17 @@ static inline int handle_pte_fault(struct mm_struct *mm, old_entry = entry = *pte; if (!pte_present(entry)) { if (pte_none(entry)) { - if (!vma->vm_ops || !vma->vm_ops->nopage) - return do_anonymous_page(mm, vma, address, - pte, pmd, write_access); - return do_no_page(mm, vma, address, - pte, pmd, write_access); + if (vma->vm_ops) { + if (vma->vm_ops->nopage) + return do_no_page(mm, vma, address, + pte, pmd, + write_access); + if (unlikely(vma->vm_ops->nopfn)) + return do_no_pfn(mm, vma, address, pte, + pmd, write_access); + } + return do_anonymous_page(mm, vma, address, + pte, pmd, write_access); } if (pte_file(entry)) return do_file_page(mm, vma, address, @@ -2505,3 +2619,56 @@ int in_gate_area_no_task(unsigned long addr) } #endif /* __HAVE_ARCH_GATE_AREA */ + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +{ + struct mm_struct *mm; + struct vm_area_struct *vma; + struct page *page; + void *old_buf = buf; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + down_read(&mm->mmap_sem); + /* ignore errors, just check how much was sucessfully transfered */ + while (len) { + int bytes, ret, offset; + void *maddr; + + ret = get_user_pages(tsk, mm, addr, 1, + write, 1, &page, &vma); + if (ret <= 0) + break; + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + page_cache_release(page); + len -= bytes; + buf += bytes; + addr += bytes; + } + up_read(&mm->mmap_sem); + mmput(mm); + + return buf - old_buf; +} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c37319542b70..fd678a662eae 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -13,6 +13,7 @@ #include <linux/compiler.h> #include <linux/module.h> #include <linux/pagevec.h> +#include <linux/writeback.h> #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/cpu.h> @@ -21,11 +22,41 @@ #include <linux/highmem.h> #include <linux/vmalloc.h> #include <linux/ioport.h> +#include <linux/cpuset.h> #include <asm/tlbflush.h> -extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, - unsigned long size); +/* add this memory to iomem resource */ +static struct resource *register_memory_resource(u64 start, u64 size) +{ + struct resource *res; + res = kzalloc(sizeof(struct resource), GFP_KERNEL); + BUG_ON(!res); + + res->name = "System RAM"; + res->start = start; + res->end = start + size - 1; + res->flags = IORESOURCE_MEM; + if (request_resource(&iomem_resource, res) < 0) { + printk("System RAM resource %llx - %llx cannot be added\n", + (unsigned long long)res->start, (unsigned long long)res->end); + kfree(res); + res = NULL; + } + return res; +} + +static void release_memory_resource(struct resource *res) +{ + if (!res) + return; + release_resource(res); + kfree(res); + return; +} + + +#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) { struct pglist_data *pgdat = zone->zone_pgdat; @@ -45,8 +76,6 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) return 0; } -extern int sparse_add_one_section(struct 
zone *zone, unsigned long start_pfn, - int nr_pages); static int __add_section(struct zone *zone, unsigned long phys_start_pfn) { int nr_pages = PAGES_PER_SECTION; @@ -191,8 +220,10 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) if (need_zonelists_rebuild) build_all_zonelists(); vm_total_pages = nr_free_pagecache_pages(); + writeback_set_ratelimit(); return 0; } +#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ static pg_data_t *hotadd_new_pgdat(int nid, u64 start) { @@ -222,36 +253,6 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) return; } -/* add this memory to iomem resource */ -static struct resource *register_memory_resource(u64 start, u64 size) -{ - struct resource *res; - res = kzalloc(sizeof(struct resource), GFP_KERNEL); - BUG_ON(!res); - - res->name = "System RAM"; - res->start = start; - res->end = start + size - 1; - res->flags = IORESOURCE_MEM; - if (request_resource(&iomem_resource, res) < 0) { - printk("System RAM resource %llx - %llx cannot be added\n", - (unsigned long long)res->start, (unsigned long long)res->end); - kfree(res); - res = NULL; - } - return res; -} - -static void release_memory_resource(struct resource *res) -{ - if (!res) - return; - release_resource(res); - kfree(res); - return; -} - - int add_memory(int nid, u64 start, u64 size) { @@ -283,6 +284,8 @@ int add_memory(int nid, u64 start, u64 size) /* we online node here. we can't roll back from here. */ node_set_online(nid); + cpuset_track_online_nodes(); + if (new_pgdat) { ret = register_one_node(nid); /* diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a9963ceddd65..25788b1b7fcf 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -105,7 +105,7 @@ static struct kmem_cache *sn_cache; /* Highest zone. An specific allocation for a zone below that is not policied. */ -int policy_zone = ZONE_DMA; +enum zone_type policy_zone = ZONE_DMA; struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ @@ -137,7 +137,8 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) static struct zonelist *bind_zonelist(nodemask_t *nodes) { struct zonelist *zl; - int num, max, nd, k; + int num, max, nd; + enum zone_type k; max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); @@ -148,12 +149,16 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) lower zones etc. Avoid empty zones because the memory allocator doesn't like them. If you implement node hot removal you have to fix that. */ - for (k = policy_zone; k >= 0; k--) { + k = policy_zone; + while (1) { for_each_node_mask(nd, *nodes) { struct zone *z = &NODE_DATA(nd)->node_zones[k]; if (z->present_pages > 0) zl->zones[num++] = z; } + if (k == 0) + break; + k--; } zl->zones[num] = NULL; return zl; @@ -482,7 +487,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) switch (p->policy) { case MPOL_BIND: for (i = 0; p->v.zonelist->zones[i]; i++) - node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, + node_set(zone_to_nid(p->v.zonelist->zones[i]), *nodes); break; case MPOL_DEFAULT: @@ -1131,7 +1136,9 @@ static unsigned interleave_nodes(struct mempolicy *policy) */ unsigned slab_node(struct mempolicy *policy) { - switch (policy->policy) { + int pol = policy ? policy->policy : MPOL_DEFAULT; + + switch (pol) { case MPOL_INTERLEAVE: return interleave_nodes(policy); @@ -1140,7 +1147,7 @@ unsigned slab_node(struct mempolicy *policy) * Follow bind policy behavior and start allocation at the * first node. 
*/ - return policy->v.zonelist->zones[0]->zone_pgdat->node_id; + return zone_to_nid(policy->v.zonelist->zones[0]); case MPOL_PREFERRED: if (policy->v.preferred_node >= 0) @@ -1285,7 +1292,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) if ((gfp & __GFP_WAIT) && !in_interrupt()) cpuset_update_task_memory_state(); - if (!pol || in_interrupt()) + if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; if (pol->policy == MPOL_INTERLEAVE) return alloc_page_interleave(gfp, order, interleave_nodes(pol)); @@ -1317,12 +1324,11 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) atomic_set(&new->refcnt, 1); if (new->policy == MPOL_BIND) { int sz = ksize(old->v.zonelist); - new->v.zonelist = kmalloc(sz, SLAB_KERNEL); + new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL); if (!new->v.zonelist) { kmem_cache_free(policy_cache, new); return ERR_PTR(-ENOMEM); } - memcpy(new->v.zonelist, old->v.zonelist, sz); } return new; } @@ -1644,7 +1650,7 @@ void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) nodes_clear(nodes); for (z = pol->v.zonelist->zones; *z; z++) - node_set((*z)->zone_pgdat->node_id, nodes); + node_set(zone_to_nid(*z), nodes); nodes_remap(tmp, nodes, *mpolmask, *newmask); nodes = tmp; diff --git a/mm/migrate.c b/mm/migrate.c index 3f1e0c2c942c..ba2453f9483d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -409,6 +409,7 @@ int migrate_page(struct address_space *mapping, } EXPORT_SYMBOL(migrate_page); +#ifdef CONFIG_BLOCK /* * Migration function for pages with buffers. This function can only be used * if the underlying filesystem guarantees that no other references to "page" @@ -466,6 +467,7 @@ int buffer_migrate_page(struct address_space *mapping, return 0; } EXPORT_SYMBOL(buffer_migrate_page); +#endif /* * Writeback a page to clean the dirty state @@ -525,7 +527,7 @@ static int fallback_migrate_page(struct address_space *mapping, * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ - if (page_has_buffers(page) && + if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; @@ -741,7 +743,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_node(pm->node, GFP_HIGHUSER, 0); + return alloc_pages_node(pm->node, GFP_HIGHUSER | GFP_THISNODE, 0); } /* diff --git a/mm/mmap.c b/mm/mmap.c index e66a0b524aff..eea8eefd51a8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -64,6 +64,13 @@ pgprot_t protection_map[16] = { __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 }; +pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + return protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; +} +EXPORT_SYMBOL(vm_get_page_prot); + int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; @@ -109,7 +116,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) * which are reclaimable, under pressure. 
The dentry * cache and most inode caches should fall into this */ - free += atomic_read(&slab_reclaim_pages); + free += global_page_state(NR_SLAB_RECLAIMABLE); /* * Leave the last 3% for root @@ -1098,12 +1105,6 @@ munmap_back: goto free_vma; } - /* Don't make the VMA automatically writable if it's shared, but the - * backer wishes to know when pages are first written to */ - if (vma->vm_ops && vma->vm_ops->page_mkwrite) - vma->vm_page_prot = - protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)]; - /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) * that memory reservation must be checked; but that reservation @@ -1121,6 +1122,10 @@ munmap_back: pgoff = vma->vm_pgoff; vm_flags = vma->vm_flags; + if (vma_wants_writenotify(vma)) + vma->vm_page_prot = + protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)]; + if (!file || !vma_merge(mm, prev, addr, vma->vm_end, vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { file = vma->vm_file; diff --git a/mm/mprotect.c b/mm/mprotect.c index 638edabaff71..3b8f3c0c63f3 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -27,12 +27,14 @@ #include <asm/tlbflush.h> static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, pgprot_t newprot) + unsigned long addr, unsigned long end, pgprot_t newprot, + int dirty_accountable) { pte_t *pte, oldpte; spinlock_t *ptl; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); do { oldpte = *pte; if (pte_present(oldpte)) { @@ -42,7 +44,14 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, * bits by wiping the pte and then setting the new pte * into place. */ - ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); + ptent = ptep_get_and_clear(mm, addr, pte); + ptent = pte_modify(ptent, newprot); + /* + * Avoid taking write faults for pages we know to be + * dirty. 
+ */ + if (dirty_accountable && pte_dirty(ptent)) + ptent = pte_mkwrite(ptent); set_pte_at(mm, addr, pte, ptent); lazy_mmu_prot_update(ptent); #ifdef CONFIG_MIGRATION @@ -62,11 +71,13 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, } } while (pte++, addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); } static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, - unsigned long addr, unsigned long end, pgprot_t newprot) + unsigned long addr, unsigned long end, pgprot_t newprot, + int dirty_accountable) { pmd_t *pmd; unsigned long next; @@ -76,12 +87,13 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - change_pte_range(mm, pmd, addr, next, newprot); + change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); } while (pmd++, addr = next, addr != end); } static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, - unsigned long addr, unsigned long end, pgprot_t newprot) + unsigned long addr, unsigned long end, pgprot_t newprot, + int dirty_accountable) { pud_t *pud; unsigned long next; @@ -91,12 +103,13 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - change_pmd_range(mm, pud, addr, next, newprot); + change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); } while (pud++, addr = next, addr != end); } static void change_protection(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, pgprot_t newprot) + unsigned long addr, unsigned long end, pgprot_t newprot, + int dirty_accountable) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -110,7 +123,7 @@ static void change_protection(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - change_pud_range(mm, pgd, addr, next, newprot); + change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); } while (pgd++, addr = next, addr != end); flush_tlb_range(vma, start, end); } @@ -123,10 +136,9 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long oldflags = vma->vm_flags; long nrpages = (end - start) >> PAGE_SHIFT; unsigned long charged = 0; - unsigned int mask; - pgprot_t newprot; pgoff_t pgoff; int error; + int dirty_accountable = 0; if (newflags == oldflags) { *pprev = vma; @@ -176,24 +188,23 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, } success: - /* Don't make the VMA automatically writable if it's shared, but the - * backer wishes to know when pages are first written to */ - mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED; - if (vma->vm_ops && vma->vm_ops->page_mkwrite) - mask &= ~VM_SHARED; - - newprot = protection_map[newflags & mask]; - /* * vm_flags and vm_page_prot are protected by the mmap_sem * held in write mode. 
*/ vma->vm_flags = newflags; - vma->vm_page_prot = newprot; + vma->vm_page_prot = protection_map[newflags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; + if (vma_wants_writenotify(vma)) { + vma->vm_page_prot = protection_map[newflags & + (VM_READ|VM_WRITE|VM_EXEC)]; + dirty_accountable = 1; + } + if (is_vm_hugetlb_page(vma)) - hugetlb_change_protection(vma, start, end, newprot); + hugetlb_change_protection(vma, start, end, vma->vm_page_prot); else - change_protection(vma, start, end, newprot); + change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); return 0; diff --git a/mm/mremap.c b/mm/mremap.c index 7c15cf3373ad..9c769fa29f32 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -98,6 +98,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { @@ -109,6 +110,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, set_pte_at(mm, new_addr, new_pte, pte); } + arch_leave_lazy_mmu_mode(); if (new_ptl != old_ptl) spin_unlock(new_ptl); pte_unmap_nested(new_pte - 1); diff --git a/mm/msync.c b/mm/msync.c index d083544df21b..358d73cf7b78 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -7,149 +7,33 @@ /* * The msync() system call. */ -#include <linux/slab.h> -#include <linux/pagemap.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/mman.h> -#include <linux/hugetlb.h> -#include <linux/writeback.h> #include <linux/file.h> #include <linux/syscalls.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> - -static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end) -{ - pte_t *pte; - spinlock_t *ptl; - int progress = 0; - unsigned long ret = 0; - -again: - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - do { - struct page *page; - - if (progress >= 64) { - progress = 0; - if (need_resched() || need_lockbreak(ptl)) - break; - } - progress++; - if (!pte_present(*pte)) - continue; - if (!pte_maybe_dirty(*pte)) - continue; - page = vm_normal_page(vma, addr, *pte); - if (!page) - continue; - if (ptep_clear_flush_dirty(vma, addr, pte) || - page_test_and_clear_dirty(page)) - ret += set_page_dirty(page); - progress += 3; - } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap_unlock(pte - 1, ptl); - cond_resched(); - if (addr != end) - goto again; - return ret; -} - -static inline unsigned long msync_pmd_range(struct vm_area_struct *vma, - pud_t *pud, unsigned long addr, unsigned long end) -{ - pmd_t *pmd; - unsigned long next; - unsigned long ret = 0; - - pmd = pmd_offset(pud, addr); - do { - next = pmd_addr_end(addr, end); - if (pmd_none_or_clear_bad(pmd)) - continue; - ret += msync_pte_range(vma, pmd, addr, next); - } while (pmd++, addr = next, addr != end); - return ret; -} - -static inline unsigned long msync_pud_range(struct vm_area_struct *vma, - pgd_t *pgd, unsigned long addr, unsigned long end) -{ - pud_t *pud; - unsigned long next; - unsigned long ret = 0; - - pud = pud_offset(pgd, addr); - do { - next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) - continue; - ret += msync_pmd_range(vma, pud, addr, next); - } while (pud++, addr = next, addr != end); - return ret; -} - -static unsigned long msync_page_range(struct 
vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - pgd_t *pgd; - unsigned long next; - unsigned long ret = 0; - - /* For hugepages we can't go walking the page table normally, - * but that's ok, hugetlbfs is memory based, so we don't need - * to do anything more on an msync(). - */ - if (vma->vm_flags & VM_HUGETLB) - return 0; - - BUG_ON(addr >= end); - pgd = pgd_offset(vma->vm_mm, addr); - flush_cache_range(vma, addr, end); - do { - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - ret += msync_pud_range(vma, pgd, addr, next); - } while (pgd++, addr = next, addr != end); - return ret; -} - /* * MS_SYNC syncs the entire file - including mappings. * - * MS_ASYNC does not start I/O (it used to, up to 2.5.67). Instead, it just - * marks the relevant pages dirty. The application may now run fsync() to + * MS_ASYNC does not start I/O (it used to, up to 2.5.67). + * Nor does it marks the relevant pages dirty (it used to up to 2.6.17). + * Now it doesn't do anything, since dirty pages are properly tracked. + * + * The application may now run fsync() to * write out the dirty pages and wait on the writeout and check the result. * Or the application may run fadvise(FADV_DONTNEED) against the fd to start * async writeout immediately. * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to * applications. */ -static int msync_interval(struct vm_area_struct *vma, unsigned long addr, - unsigned long end, int flags, - unsigned long *nr_pages_dirtied) -{ - struct file *file = vma->vm_file; - - if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) - return -EBUSY; - - if (file && (vma->vm_flags & VM_SHARED)) - *nr_pages_dirtied = msync_page_range(vma, addr, end); - return 0; -} - asmlinkage long sys_msync(unsigned long start, size_t len, int flags) { unsigned long end; + struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int unmapped_error = 0; int error = -EINVAL; - int done = 0; if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; @@ -169,64 +53,50 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) * If the interval [start,end) covers some unmapped address ranges, * just ignore them, but return -ENOMEM at the end. */ - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, start); - if (!vma) { - error = -ENOMEM; - goto out_unlock; - } - do { - unsigned long nr_pages_dirtied = 0; + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); + for (;;) { struct file *file; + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + goto out_unlock; /* Here start < vma->vm_end. */ if (start < vma->vm_start) { - unmapped_error = -ENOMEM; start = vma->vm_start; + if (start >= end) + goto out_unlock; + unmapped_error = -ENOMEM; } /* Here vma->vm_start <= start < vma->vm_end. */ - if (end <= vma->vm_end) { - if (start < end) { - error = msync_interval(vma, start, end, flags, - &nr_pages_dirtied); - if (error) - goto out_unlock; - } - error = unmapped_error; - done = 1; - } else { - /* Here vma->vm_start <= start < vma->vm_end < end. 
*/ - error = msync_interval(vma, start, vma->vm_end, flags, - &nr_pages_dirtied); - if (error) - goto out_unlock; + if ((flags & MS_INVALIDATE) && + (vma->vm_flags & VM_LOCKED)) { + error = -EBUSY; + goto out_unlock; } file = vma->vm_file; start = vma->vm_end; - if ((flags & MS_ASYNC) && file && nr_pages_dirtied) { - get_file(file); - up_read(¤t->mm->mmap_sem); - balance_dirty_pages_ratelimited_nr(file->f_mapping, - nr_pages_dirtied); - fput(file); - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, start); - } else if ((flags & MS_SYNC) && file && + if ((flags & MS_SYNC) && file && (vma->vm_flags & VM_SHARED)) { get_file(file); - up_read(¤t->mm->mmap_sem); + up_read(&mm->mmap_sem); error = do_fsync(file, 0); fput(file); - down_read(¤t->mm->mmap_sem); - if (error) - goto out_unlock; - vma = find_vma(current->mm, start); + if (error || start >= end) + goto out; + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); } else { + if (start >= end) { + error = 0; + goto out_unlock; + } vma = vma->vm_next; } - } while (vma && !done); + } out_unlock: - up_read(¤t->mm->mmap_sem); + up_read(&mm->mmap_sem); out: - return error; + return error ? : unmapped_error; } diff --git a/mm/nommu.c b/mm/nommu.c index c576df71e3bb..365019599df8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -122,26 +122,50 @@ unsigned int kobjsize(const void *objp) } /* - * The nommu dodgy version :-) + * get a list of pages in an address range belonging to the specified process + * and indicate the VMA that covers each page + * - this is potentially dodgy as we may end incrementing the page count of a + * slab page or a secondary page from a compound page + * - don't permit access to VMAs that don't support it, such as I/O mappings */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) { + struct vm_area_struct *vma; + unsigned long vm_flags; int i; - static struct vm_area_struct dummy_vma; + + /* calculate required read or write permissions. + * - if 'force' is set, we only require the "MAY" flags. + */ + vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); for (i = 0; i < len; i++) { + vma = find_vma(mm, start); + if (!vma) + goto finish_or_fault; + + /* protect what we can, including chardevs */ + if (vma->vm_flags & (VM_IO | VM_PFNMAP) || + !(vm_flags & vma->vm_flags)) + goto finish_or_fault; + if (pages) { pages[i] = virt_to_page(start); if (pages[i]) page_cache_get(pages[i]); } if (vmas) - vmas[i] = &dummy_vma; + vmas[i] = vma; start += PAGE_SIZE; } - return(i); + + return i; + +finish_or_fault: + return i ? 
: -EFAULT; } EXPORT_SYMBOL(get_user_pages); @@ -286,6 +310,77 @@ static void show_process_blocks(void) } #endif /* DEBUG */ +/* + * add a VMA into a process's mm_struct in the appropriate place in the list + * - should be called with mm->mmap_sem held writelocked + */ +static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) +{ + struct vm_list_struct **ppv; + + for (ppv = ¤t->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) + if ((*ppv)->vma->vm_start > vml->vma->vm_start) + break; + + vml->next = *ppv; + *ppv = vml; +} + +/* + * look up the first VMA in which addr resides, NULL if none + * - should be called with mm->mmap_sem at least held readlocked + */ +struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_list_struct *loop, *vml; + + /* search the vm_start ordered list */ + vml = NULL; + for (loop = mm->context.vmlist; loop; loop = loop->next) { + if (loop->vma->vm_start > addr) + break; + vml = loop; + } + + if (vml && vml->vma->vm_end > addr) + return vml->vma; + + return NULL; +} +EXPORT_SYMBOL(find_vma); + +/* + * find a VMA + * - we don't extend stack VMAs under NOMMU conditions + */ +struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + return find_vma(mm, addr); +} + +/* + * look up the first VMA exactly that exactly matches addr + * - should be called with mm->mmap_sem at least held readlocked + */ +static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, + unsigned long addr) +{ + struct vm_list_struct *vml; + + /* search the vm_start ordered list */ + for (vml = mm->context.vmlist; vml; vml = vml->next) { + if (vml->vma->vm_start == addr) + return vml->vma; + if (vml->vma->vm_start > addr) + break; + } + + return NULL; +} + +/* + * find a VMA in the global tree + */ static inline struct vm_area_struct *find_nommu_vma(unsigned long start) { struct vm_area_struct *vma; @@ -305,6 +400,9 @@ static inline struct vm_area_struct *find_nommu_vma(unsigned long start) return NULL; } +/* + * add a VMA in the global tree + */ static void add_nommu_vma(struct vm_area_struct *vma) { struct vm_area_struct *pvma; @@ -351,6 +449,9 @@ static void add_nommu_vma(struct vm_area_struct *vma) rb_insert_color(&vma->vm_rb, &nommu_vma_tree); } +/* + * delete a VMA from the global list + */ static void delete_nommu_vma(struct vm_area_struct *vma) { struct address_space *mapping; @@ -828,8 +929,7 @@ unsigned long do_mmap_pgoff(struct file *file, realalloc += kobjsize(vml); askedalloc += sizeof(*vml); - vml->next = current->mm->context.vmlist; - current->mm->context.vmlist = vml; + add_vma_to_mm(current->mm, vml); up_write(&nommu_vma_sem); @@ -848,7 +948,8 @@ unsigned long do_mmap_pgoff(struct file *file, up_write(&nommu_vma_sem); kfree(vml); if (vma) { - fput(vma->vm_file); + if (vma->vm_file) + fput(vma->vm_file); kfree(vma); } return ret; @@ -908,6 +1009,11 @@ static void put_vma(struct vm_area_struct *vma) } } +/* + * release a mapping + * - under NOMMU conditions the parameters must match exactly to the mapping to + * be removed + */ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) { struct vm_list_struct *vml, **parent; @@ -917,10 +1023,13 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) printk("do_munmap:\n"); #endif - for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) + for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { + if ((*parent)->vma->vm_start > addr) + break; if ((*parent)->vma->vm_start == addr && ((len 
== 0) || ((*parent)->vma->vm_end == end))) goto found; + } printk("munmap of non-mmaped memory by process %d (%s): %p\n", current->pid, current->comm, (void *) addr); @@ -946,7 +1055,20 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) return 0; } -/* Release all mmaps. */ +asmlinkage long sys_munmap(unsigned long addr, size_t len) +{ + int ret; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; +} + +/* + * Release all mappings + */ void exit_mmap(struct mm_struct * mm) { struct vm_list_struct *tmp; @@ -973,37 +1095,26 @@ void exit_mmap(struct mm_struct * mm) } } -asmlinkage long sys_munmap(unsigned long addr, size_t len) -{ - int ret; - struct mm_struct *mm = current->mm; - - down_write(&mm->mmap_sem); - ret = do_munmap(mm, addr, len); - up_write(&mm->mmap_sem); - return ret; -} - unsigned long do_brk(unsigned long addr, unsigned long len) { return -ENOMEM; } /* - * Expand (or shrink) an existing mapping, potentially moving it at the - * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) + * expand (or shrink) an existing mapping, potentially moving it at the same + * time (controlled by the MREMAP_MAYMOVE flag and available VM space) * - * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise - * This option implies MREMAP_MAYMOVE. + * under NOMMU conditions, we only permit changing a mapping's size, and only + * as long as it stays within the hole allocated by the kmalloc() call in + * do_mmap_pgoff() and the block is not shareable * - * on uClinux, we only permit changing a mapping's size, and only as long as it stays within the - * hole allocated by the kmalloc() call in do_mmap_pgoff() and the block is not shareable + * MREMAP_FIXED is not supported under NOMMU conditions */ unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) { - struct vm_list_struct *vml = NULL; + struct vm_area_struct *vma; /* insanity checks first */ if (new_len == 0) @@ -1012,58 +1123,46 @@ unsigned long do_mremap(unsigned long addr, if (flags & MREMAP_FIXED && new_addr != addr) return (unsigned long) -EINVAL; - for (vml = current->mm->context.vmlist; vml; vml = vml->next) - if (vml->vma->vm_start == addr) - goto found; - - return (unsigned long) -EINVAL; + vma = find_vma_exact(current->mm, addr); + if (!vma) + return (unsigned long) -EINVAL; - found: - if (vml->vma->vm_end != vml->vma->vm_start + old_len) + if (vma->vm_end != vma->vm_start + old_len) return (unsigned long) -EFAULT; - if (vml->vma->vm_flags & VM_MAYSHARE) + if (vma->vm_flags & VM_MAYSHARE) return (unsigned long) -EPERM; if (new_len > kobjsize((void *) addr)) return (unsigned long) -ENOMEM; /* all checks complete - do it */ - vml->vma->vm_end = vml->vma->vm_start + new_len; + vma->vm_end = vma->vm_start + new_len; askedalloc -= old_len; askedalloc += new_len; - return vml->vma->vm_start; + return vma->vm_start; } -/* - * Look up the first VMA which satisfies addr < vm_end, NULL if none - */ -struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) +asmlinkage unsigned long sys_mremap(unsigned long addr, + unsigned long old_len, unsigned long new_len, + unsigned long flags, unsigned long new_addr) { - struct vm_list_struct *vml; - - for (vml = mm->context.vmlist; vml; vml = vml->next) - if (addr >= vml->vma->vm_start && addr < vml->vma->vm_end) - return vml->vma; + unsigned long ret; - return NULL; + 
down_write(¤t->mm->mmap_sem); + ret = do_mremap(addr, old_len, new_len, flags, new_addr); + up_write(¤t->mm->mmap_sem); + return ret; } -EXPORT_SYMBOL(find_vma); - struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags) { return NULL; } -struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) -{ - return NULL; -} - int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot) { @@ -1133,7 +1232,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) * which are reclaimable, under pressure. The dentry * cache and most inode caches should fall into this */ - free += atomic_read(&slab_reclaim_pages); + free += global_page_state(NR_SLAB_RECLAIMABLE); /* * Leave the last 3% for root @@ -1206,3 +1305,44 @@ struct page *filemap_nopage(struct vm_area_struct *area, BUG(); return NULL; } + +/* + * Access another process' address space. + * - source/target buffer must be kernel space + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +{ + struct vm_area_struct *vma; + struct mm_struct *mm; + + if (addr + len < addr) + return 0; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + down_read(&mm->mmap_sem); + + /* the access must start within one of the target process's mappings */ + vma = find_vma(mm, addr); + if (vma) { + /* don't overrun this mapping */ + if (addr + len >= vma->vm_end) + len = vma->vm_end - addr; + + /* only read or write mappings where it is permitted */ + if (write && vma->vm_flags & VM_MAYWRITE) + len -= copy_to_user((void *) addr, buf, len); + else if (!write && vma->vm_flags & VM_MAYREAD) + len -= copy_from_user(buf, (void *) addr, len); + else + len = 0; + } else { + len = 0; + } + + up_read(&mm->mmap_sem); + mmput(mm); + return len; +} diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b9af136e5cfa..20f41b082e16 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -21,6 +21,8 @@ #include <linux/timex.h> #include <linux/jiffies.h> #include <linux/cpuset.h> +#include <linux/module.h> +#include <linux/notifier.h> int sysctl_panic_on_oom; /* #define DEBUG */ @@ -58,6 +60,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) } /* + * swapoff can easily use up all memory, so kill those first. + */ + if (p->flags & PF_SWAPOFF) + return ULONG_MAX; + + /* * The memory size of the process is the basis for the badness. */ points = mm->total_vm; @@ -127,6 +135,14 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) points /= 4; /* + * If p's nodes don't overlap ours, it may still help to kill p + * because p may have allocated or otherwise mapped memory on + * this node before. However it will be less likely. + */ + if (!cpuset_excl_nodes_overlap(p)) + points /= 8; + + /* * Adjust the score by oomkilladj. 
*/ if (p->oomkilladj) { @@ -161,8 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) for (z = zonelist->zones; *z; z++) if (cpuset_zone_allowed(*z, gfp_mask)) - node_clear((*z)->zone_pgdat->node_id, - nodes); + node_clear(zone_to_nid(*z), nodes); else return CONSTRAINT_CPUSET; @@ -189,27 +204,49 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) do_posix_clock_monotonic_gettime(&uptime); do_each_thread(g, p) { unsigned long points; - int releasing; - /* skip the init task with pid == 1 */ - if (p->pid == 1) - continue; - if (p->oomkilladj == OOM_DISABLE) + /* + * skip kernel threads and tasks which have already released + * their mm. + */ + if (!p->mm) continue; - /* If p's nodes don't overlap ours, it won't help to kill p. */ - if (!cpuset_excl_nodes_overlap(p)) + /* skip the init task */ + if (is_init(p)) continue; /* + * This task already has access to memory reserves and is + * being killed. Don't allow any other task access to the + * memory reserve. + * + * Note: this may have a chance of deadlock if it gets + * blocked waiting for another task which itself is waiting + * for memory. Is there a better alternative? + */ + if (test_tsk_thread_flag(p, TIF_MEMDIE)) + return ERR_PTR(-1UL); + + /* * This is in the process of releasing memory so wait for it * to finish before killing some other task by mistake. + * + * However, if p is the current task, we allow the 'kill' to + * go ahead if it is exiting: this will simply set TIF_MEMDIE, + * which will allow it to gain access to memory reserves in + * the process of exiting and releasing its resources. + * Otherwise we could get an easy OOM deadlock. */ - releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || - p->flags & PF_EXITING; - if (releasing && !(p->flags & PF_DEAD)) - return ERR_PTR(-1UL); - if (p->flags & PF_SWAPOFF) - return p; + if (p->flags & PF_EXITING) { + if (p != current) + return ERR_PTR(-1UL); + + chosen = p; + *ppoints = ULONG_MAX; + } + + if (p->oomkilladj == OOM_DISABLE) + continue; points = badness(p, uptime.tv_sec); if (points > *ppoints || !chosen) { @@ -217,32 +254,33 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) *ppoints = points; } } while_each_thread(g, p); + return chosen; } /** - * We must be careful though to never send SIGKILL a process with - * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that - * we select a process with CAP_SYS_RAW_IO set). + * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO + * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO + * set. */ static void __oom_kill_task(struct task_struct *p, const char *message) { - if (p->pid == 1) { + if (is_init(p)) { WARN_ON(1); printk(KERN_WARNING "tried to kill init!\n"); return; } - task_lock(p); - if (!p->mm || p->mm == &init_mm) { + if (!p->mm) { WARN_ON(1); printk(KERN_WARNING "tried to kill an mm-less task!\n"); - task_unlock(p); return; } - task_unlock(p); - printk(KERN_ERR "%s: Killed process %d (%s).\n", + + if (message) { + printk(KERN_ERR "%s: Killed process %d (%s).\n", message, p->pid, p->comm); + } /* * We give our sacrificial lamb high priority and access to @@ -271,7 +309,7 @@ static int oom_kill_task(struct task_struct *p, const char *message) * However, this is of no concern to us. 
*/ - if (mm == NULL || mm == &init_mm) + if (mm == NULL) return 1; __oom_kill_task(p, message); @@ -293,8 +331,17 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, struct task_struct *c; struct list_head *tsk; - printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and " - "children.\n", p->pid, p->comm, points); + /* + * If the task is already exiting, don't alarm the sysadmin or kill + * its children or threads, just set TIF_MEMDIE so it can die quickly + */ + if (p->flags & PF_EXITING) { + __oom_kill_task(p, NULL); + return 0; + } + + printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li" + " and children.\n", p->pid, p->comm, points); /* Try to kill a child first */ list_for_each(tsk, &p->children) { c = list_entry(tsk, struct task_struct, sibling); @@ -306,6 +353,20 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, return oom_kill_task(p, message); } +static BLOCKING_NOTIFIER_HEAD(oom_notify_list); + +int register_oom_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&oom_notify_list, nb); +} +EXPORT_SYMBOL_GPL(register_oom_notifier); + +int unregister_oom_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&oom_notify_list, nb); +} +EXPORT_SYMBOL_GPL(unregister_oom_notifier); + /** * out_of_memory - kill the "best" process when we run out of memory * @@ -318,10 +379,17 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { struct task_struct *p; unsigned long points = 0; + unsigned long freed = 0; + + blocking_notifier_call_chain(&oom_notify_list, 0, &freed); + if (freed > 0) + /* Got some memory back in the last second. */ + return; if (printk_ratelimit()) { - printk("oom-killer: gfp_mask=0x%x, order=%d\n", - gfp_mask, order); + printk(KERN_WARNING "%s invoked oom-killer: " + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, current->oomkilladj); dump_stack(); show_mem(); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e630188ccc40..c0d4ce144dec 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -23,12 +23,15 @@ #include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/mpage.h> +#include <linux/rmap.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/smp.h> #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/syscalls.h> +#include <linux/buffer_head.h> +#include <linux/pagevec.h> /* * The maximum number of pages to writeout in a single bdflush/kupdate @@ -45,7 +48,6 @@ */ static long ratelimit_pages = 32; -static long total_pages; /* The total number of pages in the machine. 
*/ static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ /* @@ -125,7 +127,7 @@ get_dirty_limits(long *pbackground, long *pdirty, int unmapped_ratio; long background; long dirty; - unsigned long available_memory = total_pages; + unsigned long available_memory = vm_total_pages; struct task_struct *tsk; #ifdef CONFIG_HIGHMEM @@ -140,7 +142,7 @@ get_dirty_limits(long *pbackground, long *pdirty, unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) + global_page_state(NR_ANON_PAGES)) * 100) / - total_pages; + vm_total_pages; dirty_ratio = vm_dirty_ratio; if (dirty_ratio > unmapped_ratio / 2) @@ -243,6 +245,16 @@ static void balance_dirty_pages(struct address_space *mapping) pdflush_operation(background_writeout, 0); } +void set_page_dirty_balance(struct page *page) +{ + if (set_page_dirty(page)) { + struct address_space *mapping = page_mapping(page); + + if (mapping) + balance_dirty_pages_ratelimited(mapping); + } +} + /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -491,9 +503,9 @@ void laptop_sync_completion(void) * will write six megabyte chunks, max. */ -static void set_ratelimit(void) +void writeback_set_ratelimit(void) { - ratelimit_pages = total_pages / (num_online_cpus() * 32); + ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024) @@ -503,7 +515,7 @@ static void set_ratelimit(void) static int __cpuinit ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) { - set_ratelimit(); + writeback_set_ratelimit(); return 0; } @@ -522,9 +534,7 @@ void __init page_writeback_init(void) long buffer_pages = nr_free_buffer_pages(); long correction; - total_pages = nr_free_pagecache_pages(); - - correction = (100 * 4 * buffer_pages) / total_pages; + correction = (100 * 4 * buffer_pages) / vm_total_pages; if (correction < 100) { dirty_background_ratio *= correction; @@ -538,10 +548,143 @@ void __init page_writeback_init(void) vm_dirty_ratio = 1; } mod_timer(&wb_timer, jiffies + dirty_writeback_interval); - set_ratelimit(); + writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); } +/** + * generic_writepages - walk the list of dirty pages of the given + * address space and writepage() all of them. + * + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * + * This is a library function, which implements the writepages() + * address_space_operation. + * + * If a page is already under I/O, generic_writepages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. + * + * Derived from mpage_writepages() - if you fix this you should check that + * also! 
+ */ +int generic_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; + int (*writepage)(struct page *page, struct writeback_control *wbc); + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int range_whole = 0; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } + + writepage = mapping->a_ops->writepage; + + /* deal with chardevs and other special file */ + if (!writepage) + return 0; + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + ret = (*writepage)(page, wbc); + if (ret) { + if (ret == -ENOSPC) + set_bit(AS_ENOSPC, &mapping->flags); + else + set_bit(AS_EIO, &mapping->flags); + } + + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) + unlock_page(page); + if (ret || (--(wbc->nr_to_write) <= 0)) + done = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; + } + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + return ret; +} + +EXPORT_SYMBOL(generic_writepages); + int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; @@ -550,7 +693,7 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) return 0; wbc->for_writepages = 1; if (mapping->a_ops->writepages) - ret = mapping->a_ops->writepages(mapping, wbc); + ret = mapping->a_ops->writepages(mapping, wbc); else ret = generic_writepages(mapping, wbc); wbc->for_writepages = 0; @@ -664,9 +807,11 @@ int fastcall set_page_dirty(struct page *page) if (likely(mapping)) { int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; - if (spd) - return (*spd)(page); - return __set_page_dirty_buffers(page); +#ifdef CONFIG_BLOCK + if (!spd) + spd = __set_page_dirty_buffers; +#endif + return (*spd)(page); } if (!PageDirty(page)) { if (!TestSetPageDirty(page)) @@ -690,7 +835,7 @@ int set_page_dirty_lock(struct page *page) { int ret; - lock_page(page); + 
lock_page_nosync(page); ret = set_page_dirty(page); unlock_page(page); return ret; @@ -712,9 +857,15 @@ int test_clear_page_dirty(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - if (mapping_cap_account_dirty(mapping)) - __dec_zone_page_state(page, NR_FILE_DIRTY); write_unlock_irqrestore(&mapping->tree_lock, flags); + /* + * We can continue to use `mapping' here because the + * page is locked, which pins the address_space + */ + if (mapping_cap_account_dirty(mapping)) { + page_mkclean(page); + dec_zone_page_state(page, NR_FILE_DIRTY); + } return 1; } write_unlock_irqrestore(&mapping->tree_lock, flags); @@ -744,8 +895,10 @@ int clear_page_dirty_for_io(struct page *page) if (mapping) { if (TestClearPageDirty(page)) { - if (mapping_cap_account_dirty(mapping)) + if (mapping_cap_account_dirty(mapping)) { + page_mkclean(page); dec_zone_page_state(page, NR_FILE_DIRTY); + } return 1; } return 0; @@ -803,6 +956,15 @@ int test_set_page_writeback(struct page *page) EXPORT_SYMBOL(test_set_page_writeback); /* + * Wakes up tasks that are being throttled due to writeback congestion + */ +void writeback_congestion_end(void) +{ + blk_congestion_end(WRITE); +} +EXPORT_SYMBOL(writeback_congestion_end); + +/* * Return true if any of the pages in the mapping are marged with the * passed tag. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 54a4f5375bba..4f59d90b81e6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -37,6 +37,8 @@ #include <linux/vmalloc.h> #include <linux/mempolicy.h> #include <linux/stop_machine.h> +#include <linux/sort.h> +#include <linux/pfn.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -51,7 +53,6 @@ EXPORT_SYMBOL(node_online_map); nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); unsigned long totalram_pages __read_mostly; -unsigned long totalhigh_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; long nr_swap_pages; int percpu_pagelist_fraction; @@ -69,7 +70,15 @@ static void __free_pages_ok(struct page *page, unsigned int order); * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation */ -int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 }; +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { + 256, +#ifdef CONFIG_ZONE_DMA32 + 256, +#endif +#ifdef CONFIG_HIGHMEM + 32 +#endif +}; EXPORT_SYMBOL(totalram_pages); @@ -80,11 +89,53 @@ EXPORT_SYMBOL(totalram_pages); struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; EXPORT_SYMBOL(zone_table); -static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; +static char *zone_names[MAX_NR_ZONES] = { + "DMA", +#ifdef CONFIG_ZONE_DMA32 + "DMA32", +#endif + "Normal", +#ifdef CONFIG_HIGHMEM + "HighMem" +#endif +}; + int min_free_kbytes = 1024; unsigned long __meminitdata nr_kernel_pages; unsigned long __meminitdata nr_all_pages; +static unsigned long __initdata dma_reserve; + +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + /* + * MAX_ACTIVE_REGIONS determines the maxmimum number of distinct + * ranges of memory (RAM) that may be registered with add_active_range(). 
+ * Ranges passed to add_active_range() will be merged if possible + * so the number of times add_active_range() can be called is + * related to the number of nodes and the number of holes + */ + #ifdef CONFIG_MAX_ACTIVE_REGIONS + /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ + #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS + #else + #if MAX_NUMNODES >= 32 + /* If there can be many nodes, allow up to 50 holes per node */ + #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) + #else + /* By default, allow up to 256 distinct regions */ + #define MAX_ACTIVE_REGIONS 256 + #endif + #endif + + struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; + int __initdata nr_nodemap_entries; + unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; + unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE + unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; + unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; +#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) @@ -127,7 +178,6 @@ static int bad_range(struct zone *zone, struct page *page) return 0; } - #else static inline int bad_range(struct zone *zone, struct page *page) { @@ -218,12 +268,12 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) { int i; - BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); /* * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO * and __GFP_HIGHMEM from hard or soft interrupt context. 
*/ - BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); + VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); for (i = 0; i < (1 << order); i++) clear_highpage(page + i); } @@ -347,8 +397,8 @@ static inline void __free_one_page(struct page *page, page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); - BUG_ON(page_idx & (order_size - 1)); - BUG_ON(bad_range(zone, page)); + VM_BUG_ON(page_idx & (order_size - 1)); + VM_BUG_ON(bad_range(zone, page)); zone->free_pages += order_size; while (order < MAX_ORDER-1) { @@ -421,7 +471,7 @@ static void free_pages_bulk(struct zone *zone, int count, while (count--) { struct page *page; - BUG_ON(list_empty(list)); + VM_BUG_ON(list_empty(list)); page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_one_page list manipulates */ list_del(&page->lru); @@ -432,9 +482,11 @@ static void free_pages_bulk(struct zone *zone, int count, static void free_one_page(struct zone *zone, struct page *page, int order) { - LIST_HEAD(list); - list_add(&page->lru, &list); - free_pages_bulk(zone, 1, &list, order); + spin_lock(&zone->lock); + zone->all_unreclaimable = 0; + zone->pages_scanned = 0; + __free_one_page(page, zone ,order); + spin_unlock(&zone->lock); } static void __free_pages_ok(struct page *page, unsigned int order) @@ -512,7 +564,7 @@ static inline void expand(struct zone *zone, struct page *page, area--; high--; size >>= 1; - BUG_ON(bad_range(zone, &page[size])); + VM_BUG_ON(bad_range(zone, &page[size])); list_add(&page[size].lru, &area->free_list); area->nr_free++; set_page_order(&page[size], high); @@ -615,19 +667,23 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, #ifdef CONFIG_NUMA /* * Called from the slab reaper to drain pagesets on a particular node that - * belong to the currently executing processor. + * belongs to the currently executing processor. * Note that this function must be called with the thread pinned to * a single processor. 
*/ void drain_node_pages(int nodeid) { - int i, z; + int i; + enum zone_type z; unsigned long flags; for (z = 0; z < MAX_NR_ZONES; z++) { struct zone *zone = NODE_DATA(nodeid)->node_zones + z; struct per_cpu_pageset *pset; + if (!populated_zone(zone)) + continue; + pset = zone_pcp(zone, smp_processor_id()); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; @@ -672,7 +728,8 @@ static void __drain_pages(unsigned int cpu) void mark_free_pages(struct zone *zone) { - unsigned long zone_pfn, flags; + unsigned long pfn, max_zone_pfn; + unsigned long flags; int order; struct list_head *curr; @@ -680,18 +737,25 @@ void mark_free_pages(struct zone *zone) return; spin_lock_irqsave(&zone->lock, flags); - for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) - ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn)); + + max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + + if (!PageNosave(page)) + ClearPageNosaveFree(page); + } for (order = MAX_ORDER - 1; order >= 0; --order) list_for_each(curr, &zone->free_area[order].free_list) { - unsigned long start_pfn, i; + unsigned long i; - start_pfn = page_to_pfn(list_entry(curr, struct page, lru)); + pfn = page_to_pfn(list_entry(curr, struct page, lru)); + for (i = 0; i < (1UL << order); i++) + SetPageNosaveFree(pfn_to_page(pfn + i)); + } - for (i=0; i < (1<<order); i++) - SetPageNosaveFree(pfn_to_page(start_pfn+i)); - } spin_unlock_irqrestore(&zone->lock, flags); } @@ -761,8 +825,8 @@ void split_page(struct page *page, unsigned int order) { int i; - BUG_ON(PageCompound(page)); - BUG_ON(!page_count(page)); + VM_BUG_ON(PageCompound(page)); + VM_BUG_ON(!page_count(page)); for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); } @@ -809,7 +873,7 @@ again: local_irq_restore(flags); put_cpu(); - BUG_ON(bad_range(zone, page)); + VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) goto again; return page; @@ -870,32 +934,37 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, struct zone **z = zonelist->zones; struct page *page = NULL; int classzone_idx = zone_idx(*z); + struct zone *zone; /* * Go through the zonelist once, looking for a zone with enough free. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 
*/ do { + zone = *z; + if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && + zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) + break; if ((alloc_flags & ALLOC_CPUSET) && - !cpuset_zone_allowed(*z, gfp_mask)) + !cpuset_zone_allowed(zone, gfp_mask)) continue; if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { unsigned long mark; if (alloc_flags & ALLOC_WMARK_MIN) - mark = (*z)->pages_min; + mark = zone->pages_min; else if (alloc_flags & ALLOC_WMARK_LOW) - mark = (*z)->pages_low; + mark = zone->pages_low; else - mark = (*z)->pages_high; - if (!zone_watermark_ok(*z, order, mark, + mark = zone->pages_high; + if (!zone_watermark_ok(zone , order, mark, classzone_idx, alloc_flags)) if (!zone_reclaim_mode || - !zone_reclaim(*z, gfp_mask, order)) + !zone_reclaim(zone, gfp_mask, order)) continue; } - page = buffered_rmqueue(zonelist, *z, order, gfp_mask); + page = buffered_rmqueue(zonelist, zone, order, gfp_mask); if (page) { break; } @@ -1083,7 +1152,7 @@ fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) * get_zeroed_page() returns a 32-bit address, which cannot represent * a highmem page */ - BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); + VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); page = alloc_pages(gfp_mask | __GFP_ZERO, 0); if (page) @@ -1116,7 +1185,7 @@ EXPORT_SYMBOL(__free_pages); fastcall void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) { - BUG_ON(!virt_addr_valid((void *)addr)); + VM_BUG_ON(!virt_addr_valid((void *)addr)); __free_pages(virt_to_page((void *)addr), order); } } @@ -1142,7 +1211,8 @@ EXPORT_SYMBOL(nr_free_pages); #ifdef CONFIG_NUMA unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) { - unsigned int i, sum = 0; + unsigned int sum = 0; + enum zone_type i; for (i = 0; i < MAX_NR_ZONES; i++) sum += pgdat->node_zones[i].free_pages; @@ -1187,27 +1257,11 @@ unsigned int nr_free_pagecache_pages(void) return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); } -#ifdef CONFIG_HIGHMEM -unsigned int nr_free_highpages (void) +static inline void show_node(struct zone *zone) { - pg_data_t *pgdat; - unsigned int pages = 0; - - for_each_online_pgdat(pgdat) - pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; - - return pages; + if (NUMA_BUILD) + printk("Node %ld ", zone_to_nid(zone)); } -#endif - -#ifdef CONFIG_NUMA -static void show_node(struct zone *zone) -{ - printk("Node %d ", zone->zone_pgdat->node_id); -} -#else -#define show_node(zone) do { } while (0) -#endif void si_meminfo(struct sysinfo *val) { @@ -1215,13 +1269,8 @@ void si_meminfo(struct sysinfo *val) val->sharedram = 0; val->freeram = nr_free_pages(); val->bufferram = nr_blockdev_pages(); -#ifdef CONFIG_HIGHMEM val->totalhigh = totalhigh_pages; val->freehigh = nr_free_highpages(); -#else - val->totalhigh = 0; - val->freehigh = 0; -#endif val->mem_unit = PAGE_SIZE; } @@ -1234,8 +1283,13 @@ void si_meminfo_node(struct sysinfo *val, int nid) val->totalram = pgdat->node_present_pages; val->freeram = nr_free_pages_pgdat(pgdat); +#ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; +#else + val->totalhigh = 0; + val->freehigh = 0; +#endif val->mem_unit = PAGE_SIZE; } #endif @@ -1249,43 +1303,35 @@ void si_meminfo_node(struct sysinfo *val, int nid) */ void show_free_areas(void) { - int cpu, temperature; + int cpu; unsigned long active; unsigned long inactive; unsigned long free; struct zone *zone; for_each_zone(zone) { - show_node(zone); - printk("%s per-cpu:", zone->name); - - if (!populated_zone(zone)) { - printk(" 
empty\n"); + if (!populated_zone(zone)) continue; - } else - printk("\n"); + + show_node(zone); + printk("%s per-cpu:\n", zone->name); for_each_online_cpu(cpu) { struct per_cpu_pageset *pageset; pageset = zone_pcp(zone, cpu); - for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: high %d, batch %d used:%d\n", - cpu, - temperature ? "cold" : "hot", - pageset->pcp[temperature].high, - pageset->pcp[temperature].batch, - pageset->pcp[temperature].count); + printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " + "Cold: hi:%5d, btch:%4d usd:%4d\n", + cpu, pageset->pcp[0].high, + pageset->pcp[0].batch, pageset->pcp[0].count, + pageset->pcp[1].high, pageset->pcp[1].batch, + pageset->pcp[1].count); } } get_zone_counts(&active, &inactive, &free); - printk("Free pages: %11ukB (%ukB HighMem)\n", - K(nr_free_pages()), - K(nr_free_highpages())); - printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu " "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", active, @@ -1294,13 +1340,17 @@ void show_free_areas(void) global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), nr_free_pages(), - global_page_state(NR_SLAB), + global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_FILE_MAPPED), global_page_state(NR_PAGETABLE)); for_each_zone(zone) { int i; + if (!populated_zone(zone)) + continue; + show_node(zone); printk("%s" " free:%lukB" @@ -1333,12 +1383,11 @@ void show_free_areas(void) for_each_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; + if (!populated_zone(zone)) + continue; + show_node(zone); printk("%s: ", zone->name); - if (!populated_zone(zone)) { - printk("empty\n"); - continue; - } spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { @@ -1360,39 +1409,25 @@ void show_free_areas(void) * Add all populated zones of a node to the zonelist. 
*/ static int __meminit build_zonelists_node(pg_data_t *pgdat, - struct zonelist *zonelist, int nr_zones, int zone_type) + struct zonelist *zonelist, int nr_zones, enum zone_type zone_type) { struct zone *zone; - BUG_ON(zone_type > ZONE_HIGHMEM); + BUG_ON(zone_type >= MAX_NR_ZONES); + zone_type++; do { + zone_type--; zone = pgdat->node_zones + zone_type; if (populated_zone(zone)) { -#ifndef CONFIG_HIGHMEM - BUG_ON(zone_type > ZONE_NORMAL); -#endif zonelist->zones[nr_zones++] = zone; check_highest_zone(zone_type); } - zone_type--; - } while (zone_type >= 0); + } while (zone_type); return nr_zones; } -static inline int highest_zone(int zone_bits) -{ - int res = ZONE_NORMAL; - if (zone_bits & (__force int)__GFP_HIGHMEM) - res = ZONE_HIGHMEM; - if (zone_bits & (__force int)__GFP_DMA32) - res = ZONE_DMA32; - if (zone_bits & (__force int)__GFP_DMA) - res = ZONE_DMA; - return res; -} - #ifdef CONFIG_NUMA #define MAX_NODE_LOAD (num_online_nodes()) static int __meminitdata node_load[MAX_NUMNODES]; @@ -1458,13 +1493,14 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) static void __meminit build_zonelists(pg_data_t *pgdat) { - int i, j, k, node, local_node; + int j, node, local_node; + enum zone_type i; int prev_node, load; struct zonelist *zonelist; nodemask_t used_mask; /* initialize zonelists */ - for (i = 0; i < GFP_ZONETYPES; i++) { + for (i = 0; i < MAX_NR_ZONES; i++) { zonelist = pgdat->node_zonelists + i; zonelist->zones[0] = NULL; } @@ -1494,13 +1530,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat) node_load[node] += load; prev_node = node; load--; - for (i = 0; i < GFP_ZONETYPES; i++) { + for (i = 0; i < MAX_NR_ZONES; i++) { zonelist = pgdat->node_zonelists + i; for (j = 0; zonelist->zones[j] != NULL; j++); - k = highest_zone(i); - - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); zonelist->zones[j] = NULL; } } @@ -1510,17 +1544,16 @@ static void __meminit build_zonelists(pg_data_t *pgdat) static void __meminit build_zonelists(pg_data_t *pgdat) { - int i, j, k, node, local_node; + int node, local_node; + enum zone_type i,j; local_node = pgdat->node_id; - for (i = 0; i < GFP_ZONETYPES; i++) { + for (i = 0; i < MAX_NR_ZONES; i++) { struct zonelist *zonelist; zonelist = pgdat->node_zonelists + i; - j = 0; - k = highest_zone(i); - j = build_zonelists_node(pgdat, zonelist, j, k); + j = build_zonelists_node(pgdat, zonelist, 0, i); /* * Now we build the zonelist so that it contains the zones * of all the other nodes. 
@@ -1532,12 +1565,12 @@ static void __meminit build_zonelists(pg_data_t *pgdat) for (node = local_node + 1; node < MAX_NUMNODES; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); } for (node = 0; node < local_node; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); } zonelist->zones[j] = NULL; @@ -1558,7 +1591,7 @@ static int __meminit __build_all_zonelists(void *dummy) void __meminit build_all_zonelists(void) { if (system_state == SYSTEM_BOOTING) { - __build_all_zonelists(0); + __build_all_zonelists(NULL); cpuset_init_current_mems_allowed(); } else { /* we have to stop all cpus to guaranntee there is no user @@ -1639,25 +1672,6 @@ static inline unsigned long wait_table_bits(unsigned long size) #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) -static void __init calculate_zone_totalpages(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) -{ - unsigned long realtotalpages, totalpages = 0; - int i; - - for (i = 0; i < MAX_NR_ZONES; i++) - totalpages += zones_size[i]; - pgdat->node_spanned_pages = totalpages; - - realtotalpages = totalpages; - if (zholes_size) - for (i = 0; i < MAX_NR_ZONES; i++) - realtotalpages -= zholes_size[i]; - pgdat->node_present_pages = realtotalpages; - printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); -} - - /* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is @@ -1698,8 +1712,8 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, } #define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) -void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, - unsigned long size) +void zonetable_add(struct zone *zone, int nid, enum zone_type zid, + unsigned long pfn, unsigned long size) { unsigned long snum = pfn_to_section_nr(pfn); unsigned long end = pfn_to_section_nr(pfn + size); @@ -1815,6 +1829,9 @@ static int __cpuinit process_zones(int cpu) for_each_zone(zone) { + if (!populated_zone(zone)) + continue; + zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), GFP_KERNEL, cpu_to_node(cpu)); if (!zone_pcp(zone, cpu)) @@ -1845,8 +1862,10 @@ static inline void free_zone_pagesets(int cpu) for_each_zone(zone) { struct per_cpu_pageset *pset = zone_pcp(zone, cpu); + /* Free per_cpu_pageset if it is slab allocated */ + if (pset != &boot_pageset[cpu]) + kfree(pset); zone_pcp(zone, cpu) = NULL; - kfree(pset); } } @@ -1972,6 +1991,366 @@ __meminit int init_currently_empty_zone(struct zone *zone, return 0; } +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +/* + * Basic iterator support. Return the first range of PFNs for a node + * Note: nid == MAX_NUMNODES returns first region regardless of node + */ +static int __init first_active_region_index_in_nid(int nid) +{ + int i; + + for (i = 0; i < nr_nodemap_entries; i++) + if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) + return i; + + return -1; +} + +/* + * Basic iterator support. 
Return the next active range of PFNs for a node + * Note: nid == MAX_NUMNODES returns next region regardles of node + */ +static int __init next_active_region_index_in_nid(int index, int nid) +{ + for (index = index + 1; index < nr_nodemap_entries; index++) + if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) + return index; + + return -1; +} + +#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +/* + * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. + * Architectures may implement their own version but if add_active_range() + * was used and there are no special requirements, this is a convenient + * alternative + */ +int __init early_pfn_to_nid(unsigned long pfn) +{ + int i; + + for (i = 0; i < nr_nodemap_entries; i++) { + unsigned long start_pfn = early_node_map[i].start_pfn; + unsigned long end_pfn = early_node_map[i].end_pfn; + + if (start_pfn <= pfn && pfn < end_pfn) + return early_node_map[i].nid; + } + + return 0; +} +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ + +/* Basic iterator support to walk early_node_map[] */ +#define for_each_active_range_index_in_nid(i, nid) \ + for (i = first_active_region_index_in_nid(nid); i != -1; \ + i = next_active_region_index_in_nid(i, nid)) + +/** + * free_bootmem_with_active_regions - Call free_bootmem_node for each active range + * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed + * @max_low_pfn: The highest PFN that till be passed to free_bootmem_node + * + * If an architecture guarantees that all ranges registered with + * add_active_ranges() contain no holes and may be freed, this + * this function may be used instead of calling free_bootmem() manually. + */ +void __init free_bootmem_with_active_regions(int nid, + unsigned long max_low_pfn) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) { + unsigned long size_pages = 0; + unsigned long end_pfn = early_node_map[i].end_pfn; + + if (early_node_map[i].start_pfn >= max_low_pfn) + continue; + + if (end_pfn > max_low_pfn) + end_pfn = max_low_pfn; + + size_pages = end_pfn - early_node_map[i].start_pfn; + free_bootmem_node(NODE_DATA(early_node_map[i].nid), + PFN_PHYS(early_node_map[i].start_pfn), + size_pages << PAGE_SHIFT); + } +} + +/** + * sparse_memory_present_with_active_regions - Call memory_present for each active range + * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used + * + * If an architecture guarantees that all ranges registered with + * add_active_ranges() contain no holes and may be freed, this + * this function may be used instead of calling memory_present() manually. + */ +void __init sparse_memory_present_with_active_regions(int nid) +{ + int i; + + for_each_active_range_index_in_nid(i, nid) + memory_present(early_node_map[i].nid, + early_node_map[i].start_pfn, + early_node_map[i].end_pfn); +} + +/** + * push_node_boundaries - Push node boundaries to at least the requested boundary + * @nid: The nid of the node to push the boundary for + * @start_pfn: The start pfn of the node + * @end_pfn: The end pfn of the node + * + * In reserve-based hot-add, mem_map is allocated that is unused until hotadd + * time. Specifically, on x86_64, SRAT will report ranges that can potentially + * be hotplugged even though no physical memory exists. This function allows + * an arch to push out the node boundaries so mem_map is allocated that can + * be used later. 
+ */ +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE +void __init push_node_boundaries(unsigned int nid, + unsigned long start_pfn, unsigned long end_pfn) +{ + printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", + nid, start_pfn, end_pfn); + + /* Initialise the boundary for this node if necessary */ + if (node_boundary_end_pfn[nid] == 0) + node_boundary_start_pfn[nid] = -1UL; + + /* Update the boundaries */ + if (node_boundary_start_pfn[nid] > start_pfn) + node_boundary_start_pfn[nid] = start_pfn; + if (node_boundary_end_pfn[nid] < end_pfn) + node_boundary_end_pfn[nid] = end_pfn; +} + +/* If necessary, push the node boundary out for reserve hotadd */ +static void __init account_node_boundary(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", + nid, *start_pfn, *end_pfn); + + /* Return if boundary information has not been provided */ + if (node_boundary_end_pfn[nid] == 0) + return; + + /* Check the boundaries and update if necessary */ + if (node_boundary_start_pfn[nid] < *start_pfn) + *start_pfn = node_boundary_start_pfn[nid]; + if (node_boundary_end_pfn[nid] > *end_pfn) + *end_pfn = node_boundary_end_pfn[nid]; +} +#else +void __init push_node_boundaries(unsigned int nid, + unsigned long start_pfn, unsigned long end_pfn) {} + +static void __init account_node_boundary(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn) {} +#endif + + +/** + * get_pfn_range_for_nid - Return the start and end page frames for a node + * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned + * @start_pfn: Passed by reference. On return, it will have the node start_pfn + * @end_pfn: Passed by reference. On return, it will have the node end_pfn + * + * It returns the start and end page frame of a node based on information + * provided by an arch calling add_active_range(). 
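
Editor's sketch, outside the patch: get_pfn_range_for_nid() reduces the registered ranges of one node to a [lowest start_pfn, highest end_pfn) pair, starting from the sentinel -1UL so that a node with no registered memory can be detected. A compilable userspace approximation (struct range and the sample data are invented):

#include <stdio.h>

struct range {
        int nid;
        unsigned long start_pfn, end_pfn;
};

/* Reduce all ranges of @nid to the lowest start and highest end PFN. */
static void pfn_range_for_nid(const struct range *map, int n, int nid,
                              unsigned long *start_pfn, unsigned long *end_pfn)
{
        int i;

        *start_pfn = -1UL;      /* sentinel: no range seen yet */
        *end_pfn = 0;

        for (i = 0; i < n; i++) {
                if (map[i].nid != nid)
                        continue;
                if (map[i].start_pfn < *start_pfn)
                        *start_pfn = map[i].start_pfn;
                if (map[i].end_pfn > *end_pfn)
                        *end_pfn = map[i].end_pfn;
        }

        if (*start_pfn == -1UL) {       /* node registered no memory */
                fprintf(stderr, "node %d active with no memory\n", nid);
                *start_pfn = 0;
        }
}

int main(void)
{
        struct range map[] = {
                { 0, 0x100, 0x800 }, { 1, 0x800, 0xc00 }, { 0, 0x1000, 0x2000 },
        };
        unsigned long s, e;

        pfn_range_for_nid(map, 3, 0, &s, &e);
        printf("node 0: %#lx -> %#lx\n", s, e);
        return 0;
}
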
If called for a node + * with no available memory, a warning is printed and the start and end + * PFNs will be 0 + */ +void __init get_pfn_range_for_nid(unsigned int nid, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + int i; + *start_pfn = -1UL; + *end_pfn = 0; + + for_each_active_range_index_in_nid(i, nid) { + *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); + *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); + } + + if (*start_pfn == -1UL) { + printk(KERN_WARNING "Node %u active with no memory\n", nid); + *start_pfn = 0; + } + + /* Push the node boundaries out if requested */ + account_node_boundary(nid, start_pfn, end_pfn); +} + +/* + * Return the number of pages a zone spans in a node, including holes + * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() + */ +unsigned long __init zone_spanned_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *ignored) +{ + unsigned long node_start_pfn, node_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; + + /* Get the start and end of the node and zone */ + get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; + zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + + /* Check that this node has pages within the zone's required range */ + if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) + return 0; + + /* Move the zone boundaries inside the node if necessary */ + zone_end_pfn = min(zone_end_pfn, node_end_pfn); + zone_start_pfn = max(zone_start_pfn, node_start_pfn); + + /* Return the spanned pages */ + return zone_end_pfn - zone_start_pfn; +} + +/* + * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, + * then all holes in the requested range will be accounted for + */ +unsigned long __init __absent_pages_in_range(int nid, + unsigned long range_start_pfn, + unsigned long range_end_pfn) +{ + int i = 0; + unsigned long prev_end_pfn = 0, hole_pages = 0; + unsigned long start_pfn; + + /* Find the end_pfn of the first active range of pfns in the node */ + i = first_active_region_index_in_nid(nid); + if (i == -1) + return 0; + + /* Account for ranges before physical memory on this node */ + if (early_node_map[i].start_pfn > range_start_pfn) + hole_pages = early_node_map[i].start_pfn - range_start_pfn; + + prev_end_pfn = early_node_map[i].start_pfn; + + /* Find all holes for the zone within the node */ + for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { + + /* No need to continue if prev_end_pfn is outside the zone */ + if (prev_end_pfn >= range_end_pfn) + break; + + /* Make sure the end of the zone is not within the hole */ + start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); + prev_end_pfn = max(prev_end_pfn, range_start_pfn); + + /* Update the hole size cound and move on */ + if (start_pfn > range_start_pfn) { + BUG_ON(prev_end_pfn > start_pfn); + hole_pages += start_pfn - prev_end_pfn; + } + prev_end_pfn = early_node_map[i].end_pfn; + } + + /* Account for ranges past physical memory on this node */ + if (range_end_pfn > prev_end_pfn) + hole_pages = range_end_pfn - + max(range_start_pfn, prev_end_pfn); + + return hole_pages; +} + +/** + * absent_pages_in_range - Return number of page frames in holes within a range + * @start_pfn: The start PFN to start searching for holes + * @end_pfn: The end PFN to stop searching for holes + * + * It returns the number of pages frames in memory holes within a range + */ +unsigned long __init 
absent_pages_in_range(unsigned long start_pfn, + unsigned long end_pfn) +{ + return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); +} + +/* Return the number of page frames in holes in a zone on a node */ +unsigned long __init zone_absent_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *ignored) +{ + unsigned long node_start_pfn, node_end_pfn; + unsigned long zone_start_pfn, zone_end_pfn; + + get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], + node_start_pfn); + zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], + node_end_pfn); + + return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); +} + +/* Return the zone index a PFN is in */ +int memmap_zone_idx(struct page *lmem_map) +{ + int i; + unsigned long phys_addr = virt_to_phys(lmem_map); + unsigned long pfn = phys_addr >> PAGE_SHIFT; + + for (i = 0; i < MAX_NR_ZONES; i++) + if (pfn < arch_zone_highest_possible_pfn[i]) + break; + + return i; +} +#else +static inline unsigned long zone_spanned_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *zones_size) +{ + return zones_size[zone_type]; +} + +static inline unsigned long zone_absent_pages_in_node(int nid, + unsigned long zone_type, + unsigned long *zholes_size) +{ + if (!zholes_size) + return 0; + + return zholes_size[zone_type]; +} + +static inline int memmap_zone_idx(struct page *lmem_map) +{ + return MAX_NR_ZONES; +} +#endif + +static void __init calculate_node_totalpages(struct pglist_data *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) +{ + unsigned long realtotalpages, totalpages = 0; + enum zone_type i; + + for (i = 0; i < MAX_NR_ZONES; i++) + totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, + zones_size); + pgdat->node_spanned_pages = totalpages; + + realtotalpages = totalpages; + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= + zone_absent_pages_in_node(pgdat->node_id, i, + zholes_size); + pgdat->node_present_pages = realtotalpages; + printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, + realtotalpages); +} + /* * Set up the zone data structures: * - mark all pages reserved @@ -1981,7 +2360,7 @@ __meminit int init_currently_empty_zone(struct zone *zone, static void __meminit free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { - unsigned long j; + enum zone_type j; int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; int ret; @@ -1993,21 +2372,46 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, realsize; + unsigned long size, realsize, memmap_pages; - realsize = size = zones_size[j]; - if (zholes_size) - realsize -= zholes_size[j]; + size = zone_spanned_pages_in_node(nid, j, zones_size); + realsize = size - zone_absent_pages_in_node(nid, j, + zholes_size); - if (j < ZONE_HIGHMEM) + /* + * Adjust realsize so that it accounts for how much memory + * is used by this zone for memmap. 
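
Editor's note (not in the patch): the adjustment below boils down to memmap_pages = spanned_pages * sizeof(struct page) / PAGE_SIZE. A quick stand-alone calculation, assuming 4 KiB pages and a 56-byte struct page (the real size depends on the kernel configuration):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
        unsigned long spanned = 262144;         /* a 1 GiB zone of 4 KiB pages */
        unsigned long sizeof_struct_page = 56;  /* assumed; varies with config */
        unsigned long memmap_pages = (spanned * sizeof_struct_page) >> PAGE_SHIFT;

        /* 262144 * 56 / 4096 = 3584 pages (14 MiB) are not usable by the zone */
        printf("memmap takes %lu of %lu pages\n", memmap_pages, spanned);
        return 0;
}
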
This affects the watermark + * and per-cpu initialisations + */ + memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; + if (realsize >= memmap_pages) { + realsize -= memmap_pages; + printk(KERN_DEBUG + " %s zone: %lu pages used for memmap\n", + zone_names[j], memmap_pages); + } else + printk(KERN_WARNING + " %s zone: %lu pages exceeds realsize %lu\n", + zone_names[j], memmap_pages, realsize); + + /* Account for reserved DMA pages */ + if (j == ZONE_DMA && realsize > dma_reserve) { + realsize -= dma_reserve; + printk(KERN_DEBUG " DMA zone: %lu pages reserved\n", + dma_reserve); + } + + if (!is_highmem_idx(j)) nr_kernel_pages += realsize; nr_all_pages += realsize; zone->spanned_pages = size; zone->present_pages = realsize; #ifdef CONFIG_NUMA - zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio) + zone->node = nid; + zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) / 100; + zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; spin_lock_init(&zone->lock); @@ -2067,8 +2471,13 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) /* * With no DISCONTIG, the global mem_map is just set as node 0's */ - if (pgdat == NODE_DATA(0)) + if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + if (page_to_pfn(mem_map) != pgdat->node_start_pfn) + mem_map -= pgdat->node_start_pfn; +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ + } #endif #endif /* CONFIG_FLAT_NODE_MEM_MAP */ } @@ -2079,13 +2488,255 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, { pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; - calculate_zone_totalpages(pgdat, zones_size, zholes_size); + calculate_node_totalpages(pgdat, zones_size, zholes_size); alloc_node_mem_map(pgdat); free_area_init_core(pgdat, zones_size, zholes_size); } +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +/** + * add_active_range - Register a range of PFNs backed by physical memory + * @nid: The node ID the range resides on + * @start_pfn: The start PFN of the available physical memory + * @end_pfn: The end PFN of the available physical memory + * + * These ranges are stored in an early_node_map[] and later used by + * free_area_init_nodes() to calculate zone sizes and holes. If the + * range spans a memory hole, it is up to the architecture to ensure + * the memory is not freed by the bootmem allocator. If possible + * the range being registered will be merged with existing ranges. 
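
Editor's sketch of the merge-or-append behaviour documented above for add_active_range(): a new range that is covered by, overlaps, or abuts an existing entry extends that entry instead of consuming another slot. Userspace model only; the array size and names are invented, and the per-node matching done by the kernel version is omitted.

#include <stdio.h>

#define MAX_REGIONS 8

struct region { unsigned long start, end; };

static struct region map[MAX_REGIONS];
static int nr_regions;

/* Merge [start, end) into an existing region when possible, else append. */
static void add_region(unsigned long start, unsigned long end)
{
        int i;

        for (i = 0; i < nr_regions; i++) {
                if (start >= map[i].start && end <= map[i].end)
                        return;                         /* already covered */
                if (start <= map[i].end && end > map[i].end) {
                        map[i].end = end;               /* merge forward */
                        return;
                }
                if (start < map[i].start && end >= map[i].start) {
                        map[i].start = start;           /* merge backward */
                        return;
                }
        }
        if (nr_regions < MAX_REGIONS) {
                map[nr_regions].start = start;
                map[nr_regions].end = end;
                nr_regions++;
        }
}

int main(void)
{
        int i;

        add_region(0x000, 0x100);
        add_region(0x100, 0x200);       /* extends the first entry forward */
        add_region(0x400, 0x500);       /* disjoint: new entry */
        for (i = 0; i < nr_regions; i++)
                printf("[%#lx, %#lx)\n", map[i].start, map[i].end);
        return 0;
}
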
+ */ +void __init add_active_range(unsigned int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + int i; + + printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " + "%d entries of %d used\n", + nid, start_pfn, end_pfn, + nr_nodemap_entries, MAX_ACTIVE_REGIONS); + + /* Merge with existing active regions if possible */ + for (i = 0; i < nr_nodemap_entries; i++) { + if (early_node_map[i].nid != nid) + continue; + + /* Skip if an existing region covers this new one */ + if (start_pfn >= early_node_map[i].start_pfn && + end_pfn <= early_node_map[i].end_pfn) + return; + + /* Merge forward if suitable */ + if (start_pfn <= early_node_map[i].end_pfn && + end_pfn > early_node_map[i].end_pfn) { + early_node_map[i].end_pfn = end_pfn; + return; + } + + /* Merge backward if suitable */ + if (start_pfn < early_node_map[i].end_pfn && + end_pfn >= early_node_map[i].start_pfn) { + early_node_map[i].start_pfn = start_pfn; + return; + } + } + + /* Check that early_node_map is large enough */ + if (i >= MAX_ACTIVE_REGIONS) { + printk(KERN_CRIT "More than %d memory regions, truncating\n", + MAX_ACTIVE_REGIONS); + return; + } + + early_node_map[i].nid = nid; + early_node_map[i].start_pfn = start_pfn; + early_node_map[i].end_pfn = end_pfn; + nr_nodemap_entries = i + 1; +} + +/** + * shrink_active_range - Shrink an existing registered range of PFNs + * @nid: The node id the range is on that should be shrunk + * @old_end_pfn: The old end PFN of the range + * @new_end_pfn: The new PFN of the range + * + * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. + * The map is kept at the end physical page range that has already been + * registered with add_active_range(). This function allows an arch to shrink + * an existing registered range. + */ +void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, + unsigned long new_end_pfn) +{ + int i; + + /* Find the old active region end and shrink */ + for_each_active_range_index_in_nid(i, nid) + if (early_node_map[i].end_pfn == old_end_pfn) { + early_node_map[i].end_pfn = new_end_pfn; + break; + } +} + +/** + * remove_all_active_ranges - Remove all currently registered regions + * During discovery, it may be found that a table like SRAT is invalid + * and an alternative discovery method must be used. This function removes + * all currently registered regions. + */ +void __init remove_all_active_ranges() +{ + memset(early_node_map, 0, sizeof(early_node_map)); + nr_nodemap_entries = 0; +#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE + memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); + memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); +#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ +} + +/* Compare two active node_active_regions */ +static int __init cmp_node_active_region(const void *a, const void *b) +{ + struct node_active_region *arange = (struct node_active_region *)a; + struct node_active_region *brange = (struct node_active_region *)b; + + /* Done this way to avoid overflows */ + if (arange->start_pfn > brange->start_pfn) + return 1; + if (arange->start_pfn < brange->start_pfn) + return -1; + + return 0; +} + +/* sort the node_map by start_pfn */ +static void __init sort_node_map(void) +{ + sort(early_node_map, (size_t)nr_nodemap_entries, + sizeof(struct node_active_region), + cmp_node_active_region, NULL); +} + +/* Find the lowest pfn for a node. 
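
Editor's sketch: cmp_node_active_region() compares start_pfn values explicitly instead of returning their difference, so large unsigned PFNs cannot overflow the int result, and once the map is sorted the first entry found for a node is its lowest PFN, which is what find_min_pfn_for_node() relies on. A userspace equivalent using qsort() (sample data invented):

#include <stdio.h>
#include <stdlib.h>

struct region {
        int nid;
        unsigned long start_pfn, end_pfn;
};

/* Compare by start_pfn; avoids "return a - b", which can overflow int. */
static int cmp_region(const void *a, const void *b)
{
        const struct region *ra = a, *rb = b;

        if (ra->start_pfn > rb->start_pfn)
                return 1;
        if (ra->start_pfn < rb->start_pfn)
                return -1;
        return 0;
}

int main(void)
{
        struct region map[] = {
                { 1, 0x100000, 0x140000 },
                { 0, 0x000010, 0x020000 },
                { 0, 0x040000, 0x080000 },
        };
        int i, n = sizeof(map) / sizeof(map[0]);

        qsort(map, n, sizeof(map[0]), cmp_region);

        /* First entry for a node in the sorted map is its lowest PFN. */
        for (i = 0; i < n; i++)
                if (map[i].nid == 0) {
                        printf("node 0 starts at pfn %#lx\n", map[i].start_pfn);
                        break;
                }
        return 0;
}
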
This depends on a sorted early_node_map */ +unsigned long __init find_min_pfn_for_node(unsigned long nid) +{ + int i; + + /* Assuming a sorted map, the first range found has the starting pfn */ + for_each_active_range_index_in_nid(i, nid) + return early_node_map[i].start_pfn; + + printk(KERN_WARNING "Could not find start_pfn for node %lu\n", nid); + return 0; +} + +/** + * find_min_pfn_with_active_regions - Find the minimum PFN registered + * + * It returns the minimum PFN based on information provided via + * add_active_range() + */ +unsigned long __init find_min_pfn_with_active_regions(void) +{ + return find_min_pfn_for_node(MAX_NUMNODES); +} + +/** + * find_max_pfn_with_active_regions - Find the maximum PFN registered + * + * It returns the maximum PFN based on information provided via + * add_active_range() + */ +unsigned long __init find_max_pfn_with_active_regions(void) +{ + int i; + unsigned long max_pfn = 0; + + for (i = 0; i < nr_nodemap_entries; i++) + max_pfn = max(max_pfn, early_node_map[i].end_pfn); + + return max_pfn; +} + +/** + * free_area_init_nodes - Initialise all pg_data_t and zone data + * @arch_max_dma_pfn: The maximum PFN usable for ZONE_DMA + * @arch_max_dma32_pfn: The maximum PFN usable for ZONE_DMA32 + * @arch_max_low_pfn: The maximum PFN usable for ZONE_NORMAL + * @arch_max_high_pfn: The maximum PFN usable for ZONE_HIGHMEM + * + * This will call free_area_init_node() for each active node in the system. + * Using the page ranges provided by add_active_range(), the size of each + * zone in each node and their holes is calculated. If the maximum PFN + * between two adjacent zones match, it is assumed that the zone is empty. + * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed + * that arch_max_dma32_pfn has no pages. It is also assumed that a zone + * starts where the previous one ended. For example, ZONE_DMA32 starts + * at arch_max_dma_pfn. 
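
Editor's sketch of the boundary derivation described above for free_area_init_nodes(): each zone's lowest possible PFN is the previous zone's highest, and a zone whose max_zone_pfn does not exceed that boundary ends up empty. The PFN values below are invented examples, assuming 4 KiB pages.

#include <stdio.h>

#define NR_ZONES 4

static const char * const names[NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };

int main(void)
{
        /* Example per-zone maximums an arch might pass: 16 MiB, 4 GiB, 9 GiB,
         * and no separate HighMem (same value as Normal). */
        unsigned long max_zone_pfn[NR_ZONES] = { 0x1000, 0x100000, 0x240000, 0x240000 };
        unsigned long lowest[NR_ZONES], highest[NR_ZONES];
        unsigned long min_pfn = 0x10;   /* lowest registered PFN (assumed) */
        int i;

        lowest[0] = min_pfn;
        highest[0] = max_zone_pfn[0];
        for (i = 1; i < NR_ZONES; i++) {
                lowest[i] = highest[i - 1];     /* starts where the previous ended */
                highest[i] = max_zone_pfn[i] > lowest[i] ?
                                max_zone_pfn[i] : lowest[i];
        }

        for (i = 0; i < NR_ZONES; i++)
                printf("%-8s %#9lx -> %#9lx%s\n", names[i], lowest[i], highest[i],
                       lowest[i] == highest[i] ? "  (empty)" : "");
        return 0;
}
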
+ */ +void __init free_area_init_nodes(unsigned long *max_zone_pfn) +{ + unsigned long nid; + enum zone_type i; + + /* Record where the zone boundaries are */ + memset(arch_zone_lowest_possible_pfn, 0, + sizeof(arch_zone_lowest_possible_pfn)); + memset(arch_zone_highest_possible_pfn, 0, + sizeof(arch_zone_highest_possible_pfn)); + arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); + arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; + for (i = 1; i < MAX_NR_ZONES; i++) { + arch_zone_lowest_possible_pfn[i] = + arch_zone_highest_possible_pfn[i-1]; + arch_zone_highest_possible_pfn[i] = + max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); + } + + /* Regions in the early_node_map can be in any order */ + sort_node_map(); + + /* Print out the zone ranges */ + printk("Zone PFN ranges:\n"); + for (i = 0; i < MAX_NR_ZONES; i++) + printk(" %-8s %8lu -> %8lu\n", + zone_names[i], + arch_zone_lowest_possible_pfn[i], + arch_zone_highest_possible_pfn[i]); + + /* Print out the early_node_map[] */ + printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); + for (i = 0; i < nr_nodemap_entries; i++) + printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, + early_node_map[i].start_pfn, + early_node_map[i].end_pfn); + + /* Initialise every node */ + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + free_area_init_node(nid, pgdat, NULL, + find_min_pfn_for_node(nid), NULL); + } +} +#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ + +/** + * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA + * @new_dma_reserve - The number of pages to mark reserved + * + * The per-cpu batchsize and zone watermarks are determined by present_pages. + * In the DMA zone, a significant percentage may be consumed by kernel image + * and other unfreeable allocations which can skew the watermarks badly. This + * function may optionally be used to account for unfreeable pages in + * ZONE_DMA. 
The effect will be lower watermarks and smaller per-cpu batchsize + */ +void __init set_dma_reserve(unsigned long new_dma_reserve) +{ + dma_reserve = new_dma_reserve; +} + #ifndef CONFIG_NEED_MULTIPLE_NODES static bootmem_data_t contig_bootmem_data; struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; @@ -2129,7 +2780,7 @@ static void calculate_totalreserve_pages(void) { struct pglist_data *pgdat; unsigned long reserve_pages = 0; - int i, j; + enum zone_type i, j; for_each_online_pgdat(pgdat) { for (i = 0; i < MAX_NR_ZONES; i++) { @@ -2162,7 +2813,7 @@ static void calculate_totalreserve_pages(void) static void setup_per_zone_lowmem_reserve(void) { struct pglist_data *pgdat; - int j, idx; + enum zone_type j, idx; for_each_online_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { @@ -2171,9 +2822,12 @@ static void setup_per_zone_lowmem_reserve(void) zone->lowmem_reserve[j] = 0; - for (idx = j-1; idx >= 0; idx--) { + idx = j; + while (idx) { struct zone *lower_zone; + idx--; + if (sysctl_lowmem_reserve_ratio[idx] < 1) sysctl_lowmem_reserve_ratio[idx] = 1; @@ -2314,10 +2968,26 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, return rc; for_each_zone(zone) - zone->min_unmapped_ratio = (zone->present_pages * + zone->min_unmapped_pages = (zone->present_pages * sysctl_min_unmapped_ratio) / 100; return 0; } + +int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + int rc; + + rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (rc) + return rc; + + for_each_zone(zone) + zone->min_slab_pages = (zone->present_pages * + sysctl_min_slab_ratio) / 100; + return 0; +} #endif /* @@ -2363,7 +3033,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, return 0; } -__initdata int hashdist = HASHDIST_DEFAULT; +int hashdist = HASHDIST_DEFAULT; #ifdef CONFIG_NUMA static int __init set_hashdist(char *str) diff --git a/mm/page_io.c b/mm/page_io.c index 88029948d00a..d4840ecbf8f9 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -52,14 +52,29 @@ static int end_swap_bio_write(struct bio *bio, unsigned int bytes_done, int err) if (bio->bi_size) return 1; - if (!uptodate) + if (!uptodate) { SetPageError(page); + /* + * We failed to write the page out to swap-space. + * Re-dirty the page in order to avoid it being reclaimed. + * Also print a dire warning that things will go BAD (tm) + * very quickly. 
+ * + * Also clear PG_reclaim to avoid rotate_reclaimable_page() + */ + set_page_dirty(page); + printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_sector); + ClearPageReclaim(page); + } end_page_writeback(page); bio_put(bio); return 0; } -static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) +int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) { const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; @@ -70,6 +85,10 @@ static int end_swap_bio_read(struct bio *bio, unsigned int bytes_done, int err) if (!uptodate) { SetPageError(page); ClearPageUptodate(page); + printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_sector); } else { SetPageUptodate(page); } @@ -137,10 +156,12 @@ out: * We use end_swap_bio_read() even for writes, because it happens to do what * we want. */ -int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page) +int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page, + struct bio **bio_chain) { struct bio *bio; int ret = 0; + int bio_rw; lock_page(page); @@ -151,11 +172,22 @@ int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page) goto out; } - submit_bio(rw | (1 << BIO_RW_SYNC), bio); - wait_on_page_locked(page); - - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; + bio_rw = rw; + if (!bio_chain) + bio_rw |= (1 << BIO_RW_SYNC); + if (bio_chain) + bio_get(bio); + submit_bio(bio_rw, bio); + if (bio_chain == NULL) { + wait_on_page_locked(page); + + if (!PageUptodate(page) || PageError(page)) + ret = -EIO; + } + if (bio_chain) { + bio->bi_private = *bio_chain; + *bio_chain = bio; + } out: return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 40158b59729e..e2155d791d99 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -434,6 +434,71 @@ int page_referenced(struct page *page, int is_locked) return referenced; } +static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long address; + pte_t *pte, entry; + spinlock_t *ptl; + int ret = 0; + + address = vma_address(page, vma); + if (address == -EFAULT) + goto out; + + pte = page_check_address(page, mm, address, &ptl); + if (!pte) + goto out; + + if (!pte_dirty(*pte) && !pte_write(*pte)) + goto unlock; + + entry = ptep_get_and_clear(mm, address, pte); + entry = pte_mkclean(entry); + entry = pte_wrprotect(entry); + ptep_establish(vma, address, pte, entry); + lazy_mmu_prot_update(entry); + ret = 1; + +unlock: + pte_unmap_unlock(pte, ptl); +out: + return ret; +} + +static int page_mkclean_file(struct address_space *mapping, struct page *page) +{ + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct vm_area_struct *vma; + struct prio_tree_iter iter; + int ret = 0; + + BUG_ON(PageAnon(page)); + + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + if (vma->vm_flags & VM_SHARED) + ret += page_mkclean_one(page, vma); + } + spin_unlock(&mapping->i_mmap_lock); + return ret; +} + +int page_mkclean(struct page *page) +{ + int ret = 0; + + BUG_ON(!PageLocked(page)); + + if (page_mapped(page)) { + struct address_space *mapping = page_mapping(page); + if (mapping) + ret = page_mkclean_file(mapping, page); + } + + return ret; +} + /** * page_set_anon_rmap - setup 
new anonymous rmap * @page: the page to add the mapping to diff --git a/mm/shmem.c b/mm/shmem.c index db21c51531ca..bb8ca7ef7094 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -26,6 +26,8 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/xattr.h> +#include <linux/generic_acl.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/file.h> @@ -45,6 +47,7 @@ #include <linux/namei.h> #include <linux/ctype.h> #include <linux/migrate.h> +#include <linux/highmem.h> #include <asm/uaccess.h> #include <asm/div64.h> @@ -176,6 +179,7 @@ static const struct address_space_operations shmem_aops; static struct file_operations shmem_file_operations; static struct inode_operations shmem_inode_operations; static struct inode_operations shmem_dir_inode_operations; +static struct inode_operations shmem_special_inode_operations; static struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { @@ -636,7 +640,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) struct page *page = NULL; int error; - if (attr->ia_valid & ATTR_SIZE) { + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { if (attr->ia_size < inode->i_size) { /* * If truncating down to a partial page, then @@ -669,6 +673,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) error = inode_change_ok(inode, attr); if (!error) error = inode_setattr(inode, attr); +#ifdef CONFIG_TMPFS_POSIX_ACL + if (!error && (attr->ia_valid & ATTR_MODE)) + error = generic_acl_chmod(inode, &shmem_acl_ops); +#endif if (page) page_cache_release(page); return error; @@ -1350,7 +1358,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_mode = mode; inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; - inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; inode->i_mapping->a_ops = &shmem_aops; inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; @@ -1362,6 +1369,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) switch (mode & S_IFMT) { default: + inode->i_op = &shmem_special_inode_operations; init_special_inode(inode, mode, dev); break; case S_IFREG: @@ -1371,7 +1379,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) &sbinfo->policy_nodes); break; case S_IFDIR: - inode->i_nlink++; + inc_nlink(inode); /* Some things misbehave if size == 0 on a directory */ inode->i_size = 2 * BOGO_DIRENT_SIZE; inode->i_op = &shmem_dir_inode_operations; @@ -1682,7 +1690,11 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) iput(inode); return error; } - error = 0; + } + error = shmem_acl_init(inode, dir); + if (error) { + iput(inode); + return error; } if (dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; @@ -1703,7 +1715,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode) if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) return error; - dir->i_nlink++; + inc_nlink(dir); return 0; } @@ -1738,7 +1750,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; - inode->i_nlink++; + inc_nlink(inode); atomic_inc(&inode->i_count); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ d_instantiate(dentry, inode); @@ -1760,7 +1772,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) dir->i_size -= BOGO_DIRENT_SIZE; inode->i_ctime 
= dir->i_ctime = dir->i_mtime = CURRENT_TIME; - inode->i_nlink--; + drop_nlink(inode); dput(dentry); /* Undo the count from "create" - this does all the work */ return 0; } @@ -1770,8 +1782,8 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) if (!simple_empty(dentry)) return -ENOTEMPTY; - dentry->d_inode->i_nlink--; - dir->i_nlink--; + drop_nlink(dentry->d_inode); + drop_nlink(dir); return shmem_unlink(dir, dentry); } @@ -1792,10 +1804,10 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct if (new_dentry->d_inode) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) - old_dir->i_nlink--; + drop_nlink(old_dir); } else if (they_are_dirs) { - old_dir->i_nlink--; - new_dir->i_nlink++; + drop_nlink(old_dir); + inc_nlink(new_dir); } old_dir->i_size -= BOGO_DIRENT_SIZE; @@ -1897,6 +1909,53 @@ static struct inode_operations shmem_symlink_inode_operations = { .put_link = shmem_put_link, }; +#ifdef CONFIG_TMPFS_POSIX_ACL +/** + * Superblocks without xattr inode operations will get security.* xattr + * support from the VFS "for free". As soon as we have any other xattrs + * like ACLs, we also need to implement the security.* handlers at + * filesystem level, though. + */ + +static size_t shmem_xattr_security_list(struct inode *inode, char *list, + size_t list_len, const char *name, + size_t name_len) +{ + return security_inode_listsecurity(inode, list, list_len); +} + +static int shmem_xattr_security_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return security_inode_getsecurity(inode, name, buffer, size, + -EOPNOTSUPP); +} + +static int shmem_xattr_security_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return security_inode_setsecurity(inode, name, value, size, flags); +} + +struct xattr_handler shmem_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = shmem_xattr_security_list, + .get = shmem_xattr_security_get, + .set = shmem_xattr_security_set, +}; + +static struct xattr_handler *shmem_xattr_handlers[] = { + &shmem_xattr_acl_access_handler, + &shmem_xattr_acl_default_handler, + &shmem_xattr_security_handler, + NULL +}; +#endif + static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes, int *policy, nodemask_t *policy_nodes) @@ -2094,6 +2153,10 @@ static int shmem_fill_super(struct super_block *sb, sb->s_magic = TMPFS_MAGIC; sb->s_op = &shmem_ops; sb->s_time_gran = 1; +#ifdef CONFIG_TMPFS_POSIX_ACL + sb->s_xattr = shmem_xattr_handlers; + sb->s_flags |= MS_POSIXACL; +#endif inode = shmem_get_inode(sb, S_IFDIR | mode, 0); if (!inode) @@ -2130,6 +2193,7 @@ static void shmem_destroy_inode(struct inode *inode) /* only struct inode is valid if it's an inline symlink */ mpol_free_shared_policy(&SHMEM_I(inode)->policy); } + shmem_acl_destroy_inode(inode); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } @@ -2141,6 +2205,10 @@ static void init_once(void *foo, struct kmem_cache *cachep, if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) { inode_init_once(&p->vfs_inode); +#ifdef CONFIG_TMPFS_POSIX_ACL + p->i_acl = NULL; + p->i_default_acl = NULL; +#endif } } @@ -2156,8 +2224,7 @@ static int init_inodecache(void) static void destroy_inodecache(void) { - if (kmem_cache_destroy(shmem_inode_cachep)) - printk(KERN_INFO "shmem_inode_cache: not all structures were 
freed\n"); + kmem_cache_destroy(shmem_inode_cachep); } static const struct address_space_operations shmem_aops = { @@ -2185,6 +2252,14 @@ static struct inode_operations shmem_inode_operations = { .truncate = shmem_truncate, .setattr = shmem_notify_change, .truncate_range = shmem_truncate_range, +#ifdef CONFIG_TMPFS_POSIX_ACL + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, + .permission = shmem_permission, +#endif + }; static struct inode_operations shmem_dir_inode_operations = { @@ -2199,6 +2274,25 @@ static struct inode_operations shmem_dir_inode_operations = { .mknod = shmem_mknod, .rename = shmem_rename, #endif +#ifdef CONFIG_TMPFS_POSIX_ACL + .setattr = shmem_notify_change, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, + .permission = shmem_permission, +#endif +}; + +static struct inode_operations shmem_special_inode_operations = { +#ifdef CONFIG_TMPFS_POSIX_ACL + .setattr = shmem_notify_change, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = generic_listxattr, + .removexattr = generic_removexattr, + .permission = shmem_permission, +#endif }; static struct super_operations shmem_ops = { diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c new file mode 100644 index 000000000000..c946bf468718 --- /dev/null +++ b/mm/shmem_acl.c @@ -0,0 +1,197 @@ +/* + * mm/shmem_acl.c + * + * (C) 2005 Andreas Gruenbacher <agruen@suse.de> + * + * This file is released under the GPL. + */ + +#include <linux/fs.h> +#include <linux/shmem_fs.h> +#include <linux/xattr.h> +#include <linux/generic_acl.h> + +/** + * shmem_get_acl - generic_acl_operations->getacl() operation + */ +static struct posix_acl * +shmem_get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl = NULL; + + spin_lock(&inode->i_lock); + switch(type) { + case ACL_TYPE_ACCESS: + acl = posix_acl_dup(SHMEM_I(inode)->i_acl); + break; + + case ACL_TYPE_DEFAULT: + acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl); + break; + } + spin_unlock(&inode->i_lock); + + return acl; +} + +/** + * shmem_get_acl - generic_acl_operations->setacl() operation + */ +static void +shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + struct posix_acl *free = NULL; + + spin_lock(&inode->i_lock); + switch(type) { + case ACL_TYPE_ACCESS: + free = SHMEM_I(inode)->i_acl; + SHMEM_I(inode)->i_acl = posix_acl_dup(acl); + break; + + case ACL_TYPE_DEFAULT: + free = SHMEM_I(inode)->i_default_acl; + SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl); + break; + } + spin_unlock(&inode->i_lock); + posix_acl_release(free); +} + +struct generic_acl_operations shmem_acl_ops = { + .getacl = shmem_get_acl, + .setacl = shmem_set_acl, +}; + +/** + * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access, + * shmem_xattr_acl_access_handler - plumbing code to implement the + * system.posix_acl_access xattr using the generic acl functions. 
+ */ + +static size_t +shmem_list_acl_access(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) +{ + return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, + list, list_size); +} + +static int +shmem_get_acl_access(struct inode *inode, const char *name, void *buffer, + size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer, + size); +} + +static int +shmem_set_acl_access(struct inode *inode, const char *name, const void *value, + size_t size, int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value, + size); +} + +struct xattr_handler shmem_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .list = shmem_list_acl_access, + .get = shmem_get_acl_access, + .set = shmem_set_acl_access, +}; + +/** + * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default, + * shmem_xattr_acl_default_handler - plumbing code to implement the + * system.posix_acl_default xattr using the generic acl functions. + */ + +static size_t +shmem_list_acl_default(struct inode *inode, char *list, size_t list_size, + const char *name, size_t name_len) +{ + return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, + list, list_size); +} + +static int +shmem_get_acl_default(struct inode *inode, const char *name, void *buffer, + size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer, + size); +} + +static int +shmem_set_acl_default(struct inode *inode, const char *name, const void *value, + size_t size, int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value, + size); +} + +struct xattr_handler shmem_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .list = shmem_list_acl_default, + .get = shmem_get_acl_default, + .set = shmem_set_acl_default, +}; + +/** + * shmem_acl_init - Inizialize the acl(s) of a new inode + */ +int +shmem_acl_init(struct inode *inode, struct inode *dir) +{ + return generic_acl_init(inode, dir, &shmem_acl_ops); +} + +/** + * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode + * + * This is done before destroying the actual inode. 
+ */ + +void +shmem_acl_destroy_inode(struct inode *inode) +{ + if (SHMEM_I(inode)->i_acl) + posix_acl_release(SHMEM_I(inode)->i_acl); + SHMEM_I(inode)->i_acl = NULL; + if (SHMEM_I(inode)->i_default_acl) + posix_acl_release(SHMEM_I(inode)->i_default_acl); + SHMEM_I(inode)->i_default_acl = NULL; +} + +/** + * shmem_check_acl - check_acl() callback for generic_permission() + */ +static int +shmem_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); + + if (acl) { + int error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return error; + } + return -EAGAIN; +} + +/** + * shmem_permission - permission() inode operation + */ +int +shmem_permission(struct inode *inode, int mask, struct nameidata *nd) +{ + return generic_permission(inode, mask, shmem_check_acl); +} diff --git a/mm/slab.c b/mm/slab.c index 21ba06035700..3dbd6f4e7477 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -313,7 +313,7 @@ static int drain_freelist(struct kmem_cache *cache, struct kmem_list3 *l3, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); -static void enable_cpucache(struct kmem_cache *cachep); +static int enable_cpucache(struct kmem_cache *cachep); static void cache_reap(void *unused); /* @@ -674,6 +674,8 @@ static struct kmem_cache cache_cache = { #endif }; +#define BAD_ALIEN_MAGIC 0x01020304ul + #ifdef CONFIG_LOCKDEP /* @@ -682,42 +684,58 @@ static struct kmem_cache cache_cache = { * The locking for this is tricky in that it nests within the locks * of all other slabs in a few places; to deal with this special * locking we put on-slab caches into a separate lock-class. + * + * We set lock class for alien array caches which are up during init. + * The lock annotation will be lost if all cpus of a node goes down and + * then comes back up during hotplug */ -static struct lock_class_key on_slab_key; +static struct lock_class_key on_slab_l3_key; +static struct lock_class_key on_slab_alc_key; + +static inline void init_lock_keys(void) -static inline void init_lock_keys(struct cache_sizes *s) { int q; - - for (q = 0; q < MAX_NUMNODES; q++) { - if (!s->cs_cachep->nodelists[q] || OFF_SLAB(s->cs_cachep)) - continue; - lockdep_set_class(&s->cs_cachep->nodelists[q]->list_lock, - &on_slab_key); + struct cache_sizes *s = malloc_sizes; + + while (s->cs_size != ULONG_MAX) { + for_each_node(q) { + struct array_cache **alc; + int r; + struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; + if (!l3 || OFF_SLAB(s->cs_cachep)) + continue; + lockdep_set_class(&l3->list_lock, &on_slab_l3_key); + alc = l3->alien; + /* + * FIXME: This check for BAD_ALIEN_MAGIC + * should go away when common slab code is taught to + * work even without alien caches. + * Currently, non NUMA code returns BAD_ALIEN_MAGIC + * for alloc_alien_cache, + */ + if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) + continue; + for_each_node(r) { + if (alc[r]) + lockdep_set_class(&alc[r]->lock, + &on_slab_alc_key); + } + } + s++; } } - #else -static inline void init_lock_keys(struct cache_sizes *s) +static inline void init_lock_keys(void) { } #endif - - /* Guard access to the cache-chain. 
*/ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; /* - * vm_enough_memory() looks at this to determine how many slab-allocated pages - * are possibly freeable under pressure - * - * SLAB_RECLAIM_ACCOUNT turns this on per-slab - */ -atomic_t slab_reclaim_pages; - -/* * chicken and egg problem: delay the per-cpu array allocation * until the general caches are up. */ @@ -768,11 +786,10 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, return csizep->cs_cachep; } -struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) +static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) { return __find_general_cachep(size, gfpflags); } -EXPORT_SYMBOL(kmem_find_general_cachep); static size_t slab_mgmt_size(size_t nr_objs, size_t align) { @@ -955,7 +972,39 @@ static int transfer_objects(struct array_cache *to, return nr; } -#ifdef CONFIG_NUMA +#ifndef CONFIG_NUMA + +#define drain_alien_cache(cachep, alien) do { } while (0) +#define reap_alien(cachep, l3) do { } while (0) + +static inline struct array_cache **alloc_alien_cache(int node, int limit) +{ + return (struct array_cache **)BAD_ALIEN_MAGIC; +} + +static inline void free_alien_cache(struct array_cache **ac_ptr) +{ +} + +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) +{ + return 0; +} + +static inline void *alternate_node_alloc(struct kmem_cache *cachep, + gfp_t flags) +{ + return NULL; +} + +static inline void *__cache_alloc_node(struct kmem_cache *cachep, + gfp_t flags, int nodeid) +{ + return NULL; +} + +#else /* CONFIG_NUMA */ + static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); static void *alternate_node_alloc(struct kmem_cache *, gfp_t); @@ -1084,26 +1133,6 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } return 1; } - -#else - -#define drain_alien_cache(cachep, alien) do { } while (0) -#define reap_alien(cachep, l3) do { } while (0) - -static inline struct array_cache **alloc_alien_cache(int node, int limit) -{ - return (struct array_cache **) 0x01020304ul; -} - -static inline void free_alien_cache(struct array_cache **ac_ptr) -{ -} - -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) -{ - return 0; -} - #endif static int __cpuinit cpuup_callback(struct notifier_block *nfb, @@ -1422,7 +1451,6 @@ void __init kmem_cache_init(void) ARCH_KMALLOC_FLAGS|SLAB_PANIC, NULL, NULL); } - init_lock_keys(sizes); sizes->cs_dmacachep = kmem_cache_create(names->name_dma, sizes->cs_size, @@ -1491,10 +1519,15 @@ void __init kmem_cache_init(void) struct kmem_cache *cachep; mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) - enable_cpucache(cachep); + if (enable_cpucache(cachep)) + BUG(); mutex_unlock(&cache_chain_mutex); } + /* Annotate slab for lockdep -- annotate the malloc caches */ + init_lock_keys(); + + /* Done! */ g_cpucache_up = FULL; @@ -1543,7 +1576,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) */ flags |= __GFP_COMP; #endif - flags |= cachep->gfpflags; + + /* + * Under NUMA we want memory on the indicated node. We will handle + * the needed fallback ourselves since we want to serve from our + * per node object lists first for other nodes. 
+ */ + flags |= cachep->gfpflags | GFP_THISNODE; page = alloc_pages_node(nodeid, flags, cachep->gfporder); if (!page) @@ -1551,8 +1590,11 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - atomic_add(nr_pages, &slab_reclaim_pages); - add_zone_page_state(page_zone(page), NR_SLAB, nr_pages); + add_zone_page_state(page_zone(page), + NR_SLAB_RECLAIMABLE, nr_pages); + else + add_zone_page_state(page_zone(page), + NR_SLAB_UNRECLAIMABLE, nr_pages); for (i = 0; i < nr_pages; i++) __SetPageSlab(page + i); return page_address(page); @@ -1567,7 +1609,12 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) struct page *page = virt_to_page(addr); const unsigned long nr_freed = i; - sub_zone_page_state(page_zone(page), NR_SLAB, nr_freed); + if (cachep->flags & SLAB_RECLAIM_ACCOUNT) + sub_zone_page_state(page_zone(page), + NR_SLAB_RECLAIMABLE, nr_freed); + else + sub_zone_page_state(page_zone(page), + NR_SLAB_UNRECLAIMABLE, nr_freed); while (i--) { BUG_ON(!PageSlab(page)); __ClearPageSlab(page); @@ -1576,8 +1623,6 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; free_pages((unsigned long)addr, cachep->gfporder); - if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages); } static void kmem_rcu_free(struct rcu_head *head) @@ -1638,10 +1683,32 @@ static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) static void dump_line(char *data, int offset, int limit) { int i; + unsigned char error = 0; + int bad_count = 0; + printk(KERN_ERR "%03x:", offset); - for (i = 0; i < limit; i++) + for (i = 0; i < limit; i++) { + if (data[offset + i] != POISON_FREE) { + error = data[offset + i]; + bad_count++; + } printk(" %02x", (unsigned char)data[offset + i]); + } printk("\n"); + + if (bad_count == 1) { + error ^= POISON_FREE; + if (!(error & (error - 1))) { + printk(KERN_ERR "Single bit error detected. 
Probably " + "bad RAM.\n"); +#ifdef CONFIG_X86 + printk(KERN_ERR "Run memtest86+ or a similar memory " + "test tool.\n"); +#else + printk(KERN_ERR "Run a memory test tool.\n"); +#endif + } + } } #endif @@ -1834,6 +1901,27 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) } } +static void __kmem_cache_destroy(struct kmem_cache *cachep) +{ + int i; + struct kmem_list3 *l3; + + for_each_online_cpu(i) + kfree(cachep->array[i]); + + /* NUMA: free the list3 structures */ + for_each_online_node(i) { + l3 = cachep->nodelists[i]; + if (l3) { + kfree(l3->shared); + free_alien_cache(l3->alien); + kfree(l3); + } + } + kmem_cache_free(&cache_cache, cachep); +} + + /** * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created @@ -1904,12 +1992,11 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, return left_over; } -static void setup_cpu_cache(struct kmem_cache *cachep) +static int setup_cpu_cache(struct kmem_cache *cachep) { - if (g_cpucache_up == FULL) { - enable_cpucache(cachep); - return; - } + if (g_cpucache_up == FULL) + return enable_cpucache(cachep); + if (g_cpucache_up == NONE) { /* * Note: the first kmem_cache_create must create the cache @@ -1956,6 +2043,7 @@ static void setup_cpu_cache(struct kmem_cache *cachep) cpu_cache_get(cachep)->touched = 0; cachep->batchcount = 1; cachep->limit = BOOT_CPUCACHE_ENTRIES; + return 0; } /** @@ -2097,6 +2185,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, } else { ralign = BYTES_PER_WORD; } + + /* + * Redzoning and user store require word alignment. Note this will be + * overridden by architecture or caller mandated alignment if either + * is greater than BYTES_PER_WORD. + */ + if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) + ralign = BYTES_PER_WORD; + /* 2) arch mandated alignment: disables debug if necessary */ if (ralign < ARCH_SLAB_MINALIGN) { ralign = ARCH_SLAB_MINALIGN; @@ -2110,8 +2207,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); } /* - * 4) Store it. Note that the debug code below can reduce - * the alignment to BYTES_PER_WORD. + * 4) Store it. */ align = ralign; @@ -2123,20 +2219,19 @@ kmem_cache_create (const char *name, size_t size, size_t align, #if DEBUG cachep->obj_size = size; + /* + * Both debugging options require word-alignment which is calculated + * into align above. + */ if (flags & SLAB_RED_ZONE) { - /* redzoning only works with word aligned caches */ - align = BYTES_PER_WORD; - /* add space for red zone words */ cachep->obj_offset += BYTES_PER_WORD; size += 2 * BYTES_PER_WORD; } if (flags & SLAB_STORE_USER) { - /* user store requires word alignment and - * one word storage behind the end of the real - * object. + /* user store requires one word storage behind the end of + * the real object. */ - align = BYTES_PER_WORD; size += BYTES_PER_WORD; } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) @@ -2200,14 +2295,26 @@ kmem_cache_create (const char *name, size_t size, size_t align, cachep->gfpflags |= GFP_DMA; cachep->buffer_size = size; - if (flags & CFLGS_OFF_SLAB) + if (flags & CFLGS_OFF_SLAB) { cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); + /* + * This is a possibility for one of the malloc_sizes caches. + * But since we go off slab only for object size greater than + * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, + * this should not happen at all. + * But leave a BUG_ON for some lucky dude. 
+ */ + BUG_ON(!cachep->slabp_cache); + } cachep->ctor = ctor; cachep->dtor = dtor; cachep->name = name; - - setup_cpu_cache(cachep); + if (setup_cpu_cache(cachep)) { + __kmem_cache_destroy(cachep); + cachep = NULL; + goto oops; + } /* cache setup completed, link it into the list */ list_add(&cachep->next, &cache_chain); @@ -2375,7 +2482,6 @@ EXPORT_SYMBOL(kmem_cache_shrink); * @cachep: the cache to destroy * * Remove a struct kmem_cache object from the slab cache. - * Returns 0 on success. * * It is expected this function will be called by a module when it is * unloaded. This will remove the cache completely, and avoid a duplicate @@ -2387,11 +2493,8 @@ EXPORT_SYMBOL(kmem_cache_shrink); * The caller must guarantee that noone will allocate memory from the cache * during the kmem_cache_destroy(). */ -int kmem_cache_destroy(struct kmem_cache *cachep) +void kmem_cache_destroy(struct kmem_cache *cachep) { - int i; - struct kmem_list3 *l3; - BUG_ON(!cachep || in_interrupt()); /* Don't let CPUs to come and go */ @@ -2411,31 +2514,28 @@ int kmem_cache_destroy(struct kmem_cache *cachep) list_add(&cachep->next, &cache_chain); mutex_unlock(&cache_chain_mutex); unlock_cpu_hotplug(); - return 1; + return; } if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) synchronize_rcu(); - for_each_online_cpu(i) - kfree(cachep->array[i]); - - /* NUMA: free the list3 structures */ - for_each_online_node(i) { - l3 = cachep->nodelists[i]; - if (l3) { - kfree(l3->shared); - free_alien_cache(l3->alien); - kfree(l3); - } - } - kmem_cache_free(&cache_cache, cachep); + __kmem_cache_destroy(cachep); unlock_cpu_hotplug(); - return 0; } EXPORT_SYMBOL(kmem_cache_destroy); -/* Get the memory for a slab management obj. */ +/* + * Get the memory for a slab management obj. + * For a slab cache when the slab descriptor is off-slab, slab descriptors + * always come from malloc_sizes caches. The slab descriptor cannot + * come from the same cache which is getting created because, + * when we are searching for an appropriate cache for these + * descriptors in kmem_cache_create, we search through the malloc_sizes array. + * If we are creating a malloc_sizes cache here it would not be visible to + * kmem_find_general_cachep till the initialization is complete. + * Hence we cannot have slabp_cache same as the original cache. + */ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, int colour_off, gfp_t local_flags, int nodeid) @@ -2968,14 +3068,6 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) void *objp; struct array_cache *ac; -#ifdef CONFIG_NUMA - if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { - objp = alternate_node_alloc(cachep, flags); - if (objp != NULL) - return objp; - } -#endif - check_irq_off(); ac = cpu_cache_get(cachep); if (likely(ac->avail)) { @@ -2993,12 +3085,24 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) { unsigned long save_flags; - void *objp; + void *objp = NULL; cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); - objp = ____cache_alloc(cachep, flags); + + if (unlikely(NUMA_BUILD && + current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) + objp = alternate_node_alloc(cachep, flags); + + if (!objp) + objp = ____cache_alloc(cachep, flags); + /* + * We may just have run out of memory on the local node. 
+ * __cache_alloc_node() knows how to locate memory on other nodes + */ + if (NUMA_BUILD && !objp) + objp = __cache_alloc_node(cachep, flags, numa_node_id()); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); @@ -3017,7 +3121,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) { int nid_alloc, nid_here; - if (in_interrupt()) + if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; nid_alloc = nid_here = numa_node_id(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) @@ -3030,6 +3134,28 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) } /* + * Fallback function if there was no memory available and no objects on a + * certain node and we are allowed to fall back. We mimick the behavior of + * the page allocator. We fall back according to a zonelist determined by + * the policy layer while obeying cpuset constraints. + */ +void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) +{ + struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy)) + ->node_zonelists[gfp_zone(flags)]; + struct zone **z; + void *obj = NULL; + + for (z = zonelist->zones; *z && !obj; z++) + if (zone_idx(*z) <= ZONE_NORMAL && + cpuset_zone_allowed(*z, flags)) + obj = __cache_alloc_node(cache, + flags | __GFP_THISNODE, + zone_to_nid(*z)); + return obj; +} + +/* * A interface to enable slab creation on nodeid */ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, @@ -3082,11 +3208,15 @@ retry: must_grow: spin_unlock(&l3->list_lock); x = cache_grow(cachep, flags, nodeid); + if (x) + goto retry; - if (!x) - return NULL; + if (!(flags & __GFP_THISNODE)) + /* Unable to grow the cache. Fall back to other nodes. */ + return fallback_alloc(cachep, flags); + + return NULL; - goto retry; done: return obj; } @@ -3119,6 +3249,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, if (slabp->inuse == 0) { if (l3->free_objects > l3->free_limit) { l3->free_objects -= cachep->num; + /* No need to drop any previously held + * lock here, even if we have a off-slab slab + * descriptor it is guaranteed to come from + * a different cache, refer to comments before + * alloc_slabmgmt. + */ slab_destroy(cachep, slabp); } else { list_add(&slabp->list, &l3->slabs_free); @@ -3317,7 +3453,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) } EXPORT_SYMBOL(kmem_cache_alloc_node); -void *kmalloc_node(size_t size, gfp_t flags, int node) +void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *cachep; @@ -3326,7 +3462,7 @@ void *kmalloc_node(size_t size, gfp_t flags, int node) return NULL; return kmem_cache_alloc_node(cachep, flags, node); } -EXPORT_SYMBOL(kmalloc_node); +EXPORT_SYMBOL(__kmalloc_node); #endif /** @@ -3370,55 +3506,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) EXPORT_SYMBOL(__kmalloc_track_caller); #endif -#ifdef CONFIG_SMP -/** - * __alloc_percpu - allocate one copy of the object for every present - * cpu in the system, zeroing them. - * Objects should be dereferenced using the per_cpu_ptr macro only. - * - * @size: how many bytes of memory are required. 
- */ -void *__alloc_percpu(size_t size) -{ - int i; - struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL); - - if (!pdata) - return NULL; - - /* - * Cannot use for_each_online_cpu since a cpu may come online - * and we have no way of figuring out how to fix the array - * that we have allocated then.... - */ - for_each_possible_cpu(i) { - int node = cpu_to_node(i); - - if (node_online(node)) - pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node); - else - pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); - - if (!pdata->ptrs[i]) - goto unwind_oom; - memset(pdata->ptrs[i], 0, size); - } - - /* Catch derefs w/o wrappers */ - return (void *)(~(unsigned long)pdata); - -unwind_oom: - while (--i >= 0) { - if (!cpu_possible(i)) - continue; - kfree(pdata->ptrs[i]); - } - kfree(pdata); - return NULL; -} -EXPORT_SYMBOL(__alloc_percpu); -#endif - /** * kmem_cache_free - Deallocate an object * @cachep: The cache the allocation was from. @@ -3464,29 +3551,6 @@ void kfree(const void *objp) } EXPORT_SYMBOL(kfree); -#ifdef CONFIG_SMP -/** - * free_percpu - free previously allocated percpu memory - * @objp: pointer returned by alloc_percpu. - * - * Don't free memory not originally allocated by alloc_percpu() - * The complemented objp is to check for that. - */ -void free_percpu(const void *objp) -{ - int i; - struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp); - - /* - * We allocate for all cpus so we cannot use for online cpu here. - */ - for_each_possible_cpu(i) - kfree(p->ptrs[i]); - kfree(p); -} -EXPORT_SYMBOL(free_percpu); -#endif - unsigned int kmem_cache_size(struct kmem_cache *cachep) { return obj_size(cachep); @@ -3603,22 +3667,26 @@ static void do_ccupdate_local(void *info) static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, int shared) { - struct ccupdate_struct new; - int i, err; + struct ccupdate_struct *new; + int i; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; - memset(&new.new, 0, sizeof(new.new)); for_each_online_cpu(i) { - new.new[i] = alloc_arraycache(cpu_to_node(i), limit, + new->new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); - if (!new.new[i]) { + if (!new->new[i]) { for (i--; i >= 0; i--) - kfree(new.new[i]); + kfree(new->new[i]); + kfree(new); return -ENOMEM; } } - new.cachep = cachep; + new->cachep = cachep; - on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1); + on_each_cpu(do_ccupdate_local, (void *)new, 1, 1); check_irq_on(); cachep->batchcount = batchcount; @@ -3626,7 +3694,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, cachep->shared = shared; for_each_online_cpu(i) { - struct array_cache *ccold = new.new[i]; + struct array_cache *ccold = new->new[i]; if (!ccold) continue; spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); @@ -3634,18 +3702,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); kfree(ccold); } - - err = alloc_kmemlist(cachep); - if (err) { - printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", - cachep->name, -err); - BUG(); - } - return 0; + kfree(new); + return alloc_kmemlist(cachep); } /* Called with cache_chain_mutex held always */ -static void enable_cpucache(struct kmem_cache *cachep) +static int enable_cpucache(struct kmem_cache *cachep) { int err; int limit, shared; @@ -3697,6 +3759,7 @@ static void enable_cpucache(struct kmem_cache *cachep) if (err) printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", cachep->name, -err); + 
return err; } /* @@ -4157,6 +4220,7 @@ static int leaks_show(struct seq_file *m, void *p) show_symbol(m, n[2*i+2]); seq_putc(m, '\n'); } + return 0; } diff --git a/mm/slob.c b/mm/slob.c index 7b52b20b9607..542394184a58 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -270,10 +270,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, } EXPORT_SYMBOL(kmem_cache_create); -int kmem_cache_destroy(struct kmem_cache *c) +void kmem_cache_destroy(struct kmem_cache *c) { slob_free(c, sizeof(struct kmem_cache)); - return 0; } EXPORT_SYMBOL(kmem_cache_destroy); @@ -339,52 +338,3 @@ void kmem_cache_init(void) mod_timer(&slob_timer, jiffies + HZ); } - -atomic_t slab_reclaim_pages = ATOMIC_INIT(0); -EXPORT_SYMBOL(slab_reclaim_pages); - -#ifdef CONFIG_SMP - -void *__alloc_percpu(size_t size) -{ - int i; - struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); - - if (!pdata) - return NULL; - - for_each_possible_cpu(i) { - pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); - if (!pdata->ptrs[i]) - goto unwind_oom; - memset(pdata->ptrs[i], 0, size); - } - - /* Catch derefs w/o wrappers */ - return (void *) (~(unsigned long) pdata); - -unwind_oom: - while (--i >= 0) { - if (!cpu_possible(i)) - continue; - kfree(pdata->ptrs[i]); - } - kfree(pdata); - return NULL; -} -EXPORT_SYMBOL(__alloc_percpu); - -void -free_percpu(const void *objp) -{ - int i; - struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); - - for_each_possible_cpu(i) - kfree(p->ptrs[i]); - - kfree(p); -} -EXPORT_SYMBOL(free_percpu); - -#endif diff --git a/mm/swap.c b/mm/swap.c index 687686a61f7c..2e0e871f542f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -34,6 +34,25 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; +/* + * This path almost never happens for VM activity - pages are normally + * freed via pagevecs. But it gets used by networking. + */ +static void fastcall __page_cache_release(struct page *page) +{ + if (PageLRU(page)) { + unsigned long flags; + struct zone *zone = page_zone(page); + + spin_lock_irqsave(&zone->lru_lock, flags); + VM_BUG_ON(!PageLRU(page)); + __ClearPageLRU(page); + del_page_from_lru(zone, page); + spin_unlock_irqrestore(&zone->lru_lock, flags); + } + free_hot_page(page); +} + static void put_compound_page(struct page *page) { page = (struct page *)page_private(page); @@ -223,26 +242,6 @@ int lru_add_drain_all(void) #endif /* - * This path almost never happens for VM activity - pages are normally - * freed via pagevecs. But it gets used by networking. - */ -void fastcall __page_cache_release(struct page *page) -{ - if (PageLRU(page)) { - unsigned long flags; - struct zone *zone = page_zone(page); - - spin_lock_irqsave(&zone->lru_lock, flags); - BUG_ON(!PageLRU(page)); - __ClearPageLRU(page); - del_page_from_lru(zone, page); - spin_unlock_irqrestore(&zone->lru_lock, flags); - } - free_hot_page(page); -} -EXPORT_SYMBOL(__page_cache_release); - -/* * Batched page_cache_release(). Decrement the reference count on all the * passed pages. If it fell to zero then remove the page from the LRU and * free it. 
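The dump_line() change in mm/slab.c above turns a poison-check mismatch into a diagnosis: a freed object byte is expected to hold POISON_FREE, and when exactly one byte in the dumped line differs from it in exactly one bit position, the corruption is far more likely to be flaky RAM than a software bug. XORing the observed byte with the expected poison leaves only the flipped bits set, so a non-zero power of two means a single flipped bit. Below is a minimal userspace sketch of that test, assuming the 0x6b poison value from include/linux/poison.h; the helper name is invented for illustration and is not part of the patch.

#include <stdio.h>

#define POISON_FREE 0x6b	/* byte written over freed slab objects */

/* non-zero and a power of two <=> exactly one bit differs from the poison */
static int is_single_bit_error(unsigned char seen)
{
	unsigned char diff = seen ^ POISON_FREE;

	return diff && !(diff & (diff - 1));
}

int main(void)
{
	printf("%d\n", is_single_bit_error(0x6b));	/* 0: intact poison */
	printf("%d\n", is_single_bit_error(0x6a));	/* 1: bit 0 flipped */
	printf("%d\n", is_single_bit_error(0x55));	/* 0: multi-bit damage */
	return 0;
}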
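The do_tune_cpucache() rework, also in mm/slab.c above, swaps an on-stack struct ccupdate_struct for a kzalloc()'d one. The structure (its definition is not shown in these hunks) carries one array_cache pointer per possible CPU, so its size scales with NR_CPUS. The standalone sketch below suggests why such an object is better kept off the small kernel stack; NR_CPUS, the member layout and the stack-size comparison are illustrative assumptions, not values taken from the patch.

#include <stdio.h>

#define NR_CPUS 1024			/* hypothetical large SMP config */

struct kmem_cache;			/* opaque for this sketch */
struct array_cache;

struct ccupdate_struct {
	struct kmem_cache *cachep;
	struct array_cache *new[NR_CPUS];
};

int main(void)
{
	/* roughly 8 KB on a 64-bit build -- comparable to a whole kernel stack */
	printf("%zu bytes\n", sizeof(struct ccupdate_struct));
	return 0;
}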
@@ -284,7 +283,7 @@ void release_pages(struct page **pages, int nr, int cold) zone = pagezone; spin_lock_irq(&zone->lru_lock); } - BUG_ON(!PageLRU(page)); + VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); del_page_from_lru(zone, page); } @@ -337,7 +336,7 @@ void __pagevec_release_nonlru(struct pagevec *pvec) for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page)); if (put_page_testzero(page)) pagevec_add(&pages_to_free, page); } @@ -364,7 +363,7 @@ void __pagevec_lru_add(struct pagevec *pvec) zone = pagezone; spin_lock_irq(&zone->lru_lock); } - BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page)); SetPageLRU(page); add_page_to_inactive_list(zone, page); } @@ -391,9 +390,9 @@ void __pagevec_lru_add_active(struct pagevec *pvec) zone = pagezone; spin_lock_irq(&zone->lru_lock); } - BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - BUG_ON(PageActive(page)); + VM_BUG_ON(PageActive(page)); SetPageActive(page); add_page_to_active_list(zone, page); } diff --git a/mm/swapfile.c b/mm/swapfile.c index f1f5ec783781..a15def63f28f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1723,13 +1723,14 @@ get_swap_info_struct(unsigned type) */ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) { - int ret = 0, i = 1 << page_cluster; + int our_page_cluster = page_cluster; + int ret = 0, i = 1 << our_page_cluster; unsigned long toff; struct swap_info_struct *swapdev = swp_type(entry) + swap_info; - if (!page_cluster) /* no readahead */ + if (!our_page_cluster) /* no readahead */ return 0; - toff = (swp_offset(entry) >> page_cluster) << page_cluster; + toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster; if (!toff) /* first page is swap header */ toff++, i--; *offset = toff; diff --git a/mm/truncate.c b/mm/truncate.c index c6ab55ec6883..f4edbc179d14 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/swap.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/pagevec.h> @@ -16,6 +17,32 @@ do_invalidatepage */ +/** + * do_invalidatepage - invalidate part of all of a page + * @page: the page which is affected + * @offset: the index of the truncation point + * + * do_invalidatepage() is called when all or part of the page has become + * invalidated by a truncate operation. + * + * do_invalidatepage() does not have to release all buffers, but it must + * ensure that no dirty buffer is left outside @offset and that no I/O + * is underway against any of the blocks which are outside the truncation + * point. Because the caller is about to free (and possibly reuse) those + * blocks on-disk. + */ +void do_invalidatepage(struct page *page, unsigned long offset) +{ + void (*invalidatepage)(struct page *, unsigned long); + invalidatepage = page->mapping->a_ops->invalidatepage; +#ifdef CONFIG_BLOCK + if (!invalidatepage) + invalidatepage = block_invalidatepage; +#endif + if (invalidatepage) + (*invalidatepage)(page, offset); +} + static inline void truncate_partial_page(struct page *page, unsigned partial) { memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); @@ -52,36 +79,26 @@ truncate_complete_page(struct address_space *mapping, struct page *page) /* * This is for invalidate_inode_pages(). That function can be called at * any time, and is not supposed to throw away dirty pages. But pages can - * be marked dirty at any time too. So we re-check the dirtiness inside - * ->tree_lock. 
That provides exclusion against the __set_page_dirty - * functions. + * be marked dirty at any time too, so use remove_mapping which safely + * discards clean, unused pages. * * Returns non-zero if the page was successfully invalidated. */ static int invalidate_complete_page(struct address_space *mapping, struct page *page) { + int ret; + if (page->mapping != mapping) return 0; if (PagePrivate(page) && !try_to_release_page(page, 0)) return 0; - write_lock_irq(&mapping->tree_lock); - if (PageDirty(page)) - goto failed; - if (page_count(page) != 2) /* caller's ref + pagecache ref */ - goto failed; - - BUG_ON(PagePrivate(page)); - __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + ret = remove_mapping(mapping, page); ClearPageUptodate(page); - page_cache_release(page); /* pagecache ref */ - return 1; -failed: - write_unlock_irq(&mapping->tree_lock); - return 0; + + return ret; } /** @@ -270,9 +287,39 @@ unsigned long invalidate_inode_pages(struct address_space *mapping) { return invalidate_mapping_pages(mapping, 0, ~0UL); } - EXPORT_SYMBOL(invalidate_inode_pages); +/* + * This is like invalidate_complete_page(), except it ignores the page's + * refcount. We do this because invalidate_inode_pages2() needs stronger + * invalidation guarantees, and cannot afford to leave pages behind because + * shrink_list() has a temp ref on them, or because they're transiently sitting + * in the lru_cache_add() pagevecs. + */ +static int +invalidate_complete_page2(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return 0; + + if (PagePrivate(page) && !try_to_release_page(page, 0)) + return 0; + + write_lock_irq(&mapping->tree_lock); + if (PageDirty(page)) + goto failed; + + BUG_ON(PagePrivate(page)); + __remove_from_page_cache(page); + write_unlock_irq(&mapping->tree_lock); + ClearPageUptodate(page); + page_cache_release(page); /* pagecache ref */ + return 1; +failed: + write_unlock_irq(&mapping->tree_lock); + return 0; +} + /** * invalidate_inode_pages2_range - remove range of pages from an address_space * @mapping: the address_space @@ -339,7 +386,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } } was_dirty = test_clear_page_dirty(page); - if (!invalidate_complete_page(mapping, page)) { + if (!invalidate_complete_page2(mapping, page)) { if (was_dirty) set_page_dirty(page); ret = -EIO; diff --git a/mm/util.c b/mm/util.c index 7368479220b3..e14fa84ef39a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -40,6 +40,24 @@ char *kstrdup(const char *s, gfp_t gfp) } EXPORT_SYMBOL(kstrdup); +/** + * kmemdup - duplicate region of memory + * + * @src: memory region to duplicate + * @len: memory region length + * @gfp: GFP mask to use + */ +void *kmemdup(const void *src, size_t len, gfp_t gfp) +{ + void *p; + + p = ____kmalloc(len, gfp); + if (p) + memcpy(p, src, len); + return p; +} +EXPORT_SYMBOL(kmemdup); + /* * strndup_user - duplicate an existing string from user space * diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 266162d2ba28..1ac191ce5641 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -24,6 +24,9 @@ DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; +static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, + int node); + static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) { pte_t *pte; @@ -238,7 +241,6 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, /** * get_vm_area - reserve a contingous kernel virtual area - * * @size: size of the area * @flags: 
%VM_IOREMAP for I/O mappings or VM_ALLOC * @@ -270,7 +272,7 @@ static struct vm_struct *__find_vm_area(void *addr) } /* Caller must hold vmlist_lock */ -struct vm_struct *__remove_vm_area(void *addr) +static struct vm_struct *__remove_vm_area(void *addr) { struct vm_struct **p, *tmp; @@ -293,7 +295,6 @@ found: /** * remove_vm_area - find and remove a contingous kernel virtual area - * * @addr: base address * * Search for the kernel VM area starting at @addr, and remove it. @@ -352,7 +353,6 @@ void __vunmap(void *addr, int deallocate_pages) /** * vfree - release memory allocated by vmalloc() - * * @addr: memory base address * * Free the virtually contiguous memory area starting at @addr, as @@ -370,7 +370,6 @@ EXPORT_SYMBOL(vfree); /** * vunmap - release virtual mapping obtained by vmap() - * * @addr: memory base address * * Free the virtually contiguous memory area starting at @addr, @@ -387,7 +386,6 @@ EXPORT_SYMBOL(vunmap); /** * vmap - map an array of pages into virtually contiguous space - * * @pages: array of page pointers * @count: number of pages to map * @flags: vm_area->flags @@ -468,7 +466,6 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) /** * __vmalloc_node - allocate virtually contiguous memory - * * @size: allocation size * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages @@ -478,8 +475,8 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. */ -void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node) +static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, + int node) { struct vm_struct *area; @@ -493,7 +490,6 @@ void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, return __vmalloc_area_node(area, gfp_mask, prot, node); } -EXPORT_SYMBOL(__vmalloc_node); void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { @@ -503,9 +499,7 @@ EXPORT_SYMBOL(__vmalloc); /** * vmalloc - allocate virtually contiguous memory - * * @size: allocation size - * * Allocate enough pages to cover @size from the page level * allocator and map them into contiguous kernel virtual space. * @@ -519,11 +513,11 @@ void *vmalloc(unsigned long size) EXPORT_SYMBOL(vmalloc); /** - * vmalloc_user - allocate virtually contiguous memory which has - * been zeroed so it can be mapped to userspace without - * leaking data. + * vmalloc_user - allocate zeroed virtually contiguous memory for userspace + * @size: allocation size * - * @size: allocation size + * The resulting memory area is zeroed so it can be mapped to userspace + * without leaking data. 
*/ void *vmalloc_user(unsigned long size) { @@ -542,7 +536,6 @@ EXPORT_SYMBOL(vmalloc_user); /** * vmalloc_node - allocate memory on a specific node - * * @size: allocation size * @node: numa node * @@ -564,7 +557,6 @@ EXPORT_SYMBOL(vmalloc_node); /** * vmalloc_exec - allocate virtually contiguous, executable memory - * * @size: allocation size * * Kernel-internal function to allocate enough pages to cover @size @@ -582,7 +574,6 @@ void *vmalloc_exec(unsigned long size) /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) - * * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the @@ -595,11 +586,11 @@ void *vmalloc_32(unsigned long size) EXPORT_SYMBOL(vmalloc_32); /** - * vmalloc_32_user - allocate virtually contiguous memory (32bit - * addressable) which is zeroed so it can be - * mapped to userspace without leaking data. - * + * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory * @size: allocation size + * + * The resulting memory area is 32bit addressable and zeroed so it can be + * mapped to userspace without leaking data. */ void *vmalloc_32_user(unsigned long size) { @@ -693,7 +684,6 @@ finished: /** * remap_vmalloc_range - map vmalloc pages to userspace - * * @vma: vma to cover (map full range of vma) * @addr: vmalloc memory * @pgoff: number of pages into addr before first page to map diff --git a/mm/vmscan.c b/mm/vmscan.c index 5d4c4d02254d..eca70310adb2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -19,6 +19,7 @@ #include <linux/pagemap.h> #include <linux/init.h> #include <linux/highmem.h> +#include <linux/vmstat.h> #include <linux/file.h> #include <linux/writeback.h> #include <linux/blkdev.h> @@ -62,6 +63,8 @@ struct scan_control { int swap_cluster_max; int swappiness; + + int all_unreclaimable; }; /* @@ -368,7 +371,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } - + inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -377,15 +380,34 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) int remove_mapping(struct address_space *mapping, struct page *page) { - if (!mapping) - return 0; /* truncate got there first */ + BUG_ON(!PageLocked(page)); + BUG_ON(mapping != page_mapping(page)); write_lock_irq(&mapping->tree_lock); - /* - * The non-racy check for busy page. It is critical to check - * PageDirty _after_ making sure that the page is freeable and - * not in use by anybody. (pagecache + us == 2) + * The non racy check for a busy page. + * + * Must be careful with the order of the tests. When someone has + * a ref to the page, it may be possible that they dirty it then + * drop the reference. So if PageDirty is tested before page_count + * here, then the following race may occur: + * + * get_user_pages(&page); + * [user mapping goes away] + * write_to(page); + * !PageDirty(page) [good] + * SetPageDirty(page); + * put_page(page); + * !page_count(page) [good, discard it] + * + * [oops, our write_to data is lost] + * + * Reversing the order of the tests ensures such a situation cannot + * escape unnoticed. The smp_rmb is needed to ensure the page->flags + * load is not satisfied before that of page->_count. + * + * Note that if SetPageDirty is always performed via set_page_dirty, + * and thus under tree_lock, then this ordering is not required. 
*/ if (unlikely(page_count(page) != 2)) goto cannot_free; @@ -440,7 +462,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (TestSetPageLocked(page)) goto keep; - BUG_ON(PageActive(page)); + VM_BUG_ON(PageActive(page)); sc->nr_scanned++; @@ -547,7 +569,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto free_it; } - if (!remove_mapping(mapping, page)) + if (!mapping || !remove_mapping(mapping, page)) goto keep_locked; free_it: @@ -564,7 +586,7 @@ keep_locked: unlock_page(page); keep: list_add(&page->lru, &ret_pages); - BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page)); } list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) @@ -603,7 +625,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - BUG_ON(!PageLRU(page)); + VM_BUG_ON(!PageLRU(page)); list_del(&page->lru); target = src; @@ -674,7 +696,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, */ while (!list_empty(&page_list)) { page = lru_to_page(&page_list); - BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page)); SetPageLRU(page); list_del(&page->lru); if (PageActive(page)) @@ -695,6 +717,11 @@ done: return nr_reclaimed; } +static inline int zone_is_near_oom(struct zone *zone) +{ + return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3; +} + /* * This moves pages from the active list to the inactive list. * @@ -730,6 +757,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, long distress; long swap_tendency; + if (zone_is_near_oom(zone)) + goto force_reclaim_mapped; + /* * `distress' is a measure of how much trouble we're having * reclaiming pages. 0 -> no problems. 100 -> great trouble. @@ -765,6 +795,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * memory onto the inactive list. */ if (swap_tendency >= 100) +force_reclaim_mapped: reclaim_mapped = 1; } @@ -797,9 +828,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); - BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - BUG_ON(!PageActive(page)); + VM_BUG_ON(!PageActive(page)); ClearPageActive(page); list_move(&page->lru, &zone->inactive_list); @@ -827,9 +858,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, while (!list_empty(&l_active)) { page = lru_to_page(&l_active); prefetchw_prev_lru_page(page, &l_active, flags); - BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - BUG_ON(!PageActive(page)); + VM_BUG_ON(!PageActive(page)); list_move(&page->lru, &zone->active_list); pgmoved++; if (!pagevec_add(&pvec, page)) { @@ -925,6 +956,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, unsigned long nr_reclaimed = 0; int i; + sc->all_unreclaimable = 1; for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; @@ -941,6 +973,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones, if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ + sc->all_unreclaimable = 0; + nr_reclaimed += shrink_zone(priority, zone, sc); } return nr_reclaimed; @@ -1021,6 +1055,9 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) if (sc.nr_scanned && priority < DEF_PRIORITY - 2) blk_congestion_wait(WRITE, HZ/10); } + /* top priority shrink_caches still had more to do? 
don't OOM, then */ + if (!sc.all_unreclaimable) + ret = 1; out: for (i = 0; zones[i] != 0; i++) { struct zone *zone = zones[i]; @@ -1153,7 +1190,7 @@ scan: if (zone->all_unreclaimable) continue; if (nr_slab == 0 && zone->pages_scanned >= - (zone->nr_active + zone->nr_inactive) * 4) + (zone->nr_active + zone->nr_inactive) * 6) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and @@ -1361,7 +1398,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) for_each_zone(zone) lru_pages += zone->nr_active + zone->nr_inactive; - nr_slab = global_page_state(NR_SLAB); + nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { reclaim_state.reclaimed_slab = 0; @@ -1510,7 +1547,6 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ -#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ /* * Priority for ZONE_RECLAIM. This determines the fraction of pages @@ -1526,6 +1562,12 @@ int zone_reclaim_mode __read_mostly; int sysctl_min_unmapped_ratio = 1; /* + * If the number of slab pages in a zone grows beyond this percentage then + * slab reclaim needs to occur. + */ +int sysctl_min_slab_ratio = 5; + +/* * Try to free up some pages from this zone through reclaim. */ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) @@ -1544,6 +1586,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .gfp_mask = gfp_mask, .swappiness = vm_swappiness, }; + unsigned long slab_reclaimable; disable_swap_token(); cond_resched(); @@ -1556,29 +1599,43 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - /* - * Free memory by calling shrink zone with increasing priorities - * until we have enough memory freed. - */ - priority = ZONE_RECLAIM_PRIORITY; - do { - nr_reclaimed += shrink_zone(priority, zone, &sc); - priority--; - } while (priority >= 0 && nr_reclaimed < nr_pages); + if (zone_page_state(zone, NR_FILE_PAGES) - + zone_page_state(zone, NR_FILE_MAPPED) > + zone->min_unmapped_pages) { + /* + * Free memory by calling shrink zone with increasing + * priorities until we have enough memory freed. + */ + priority = ZONE_RECLAIM_PRIORITY; + do { + nr_reclaimed += shrink_zone(priority, zone, &sc); + priority--; + } while (priority >= 0 && nr_reclaimed < nr_pages); + } - if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { + slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); + if (slab_reclaimable > zone->min_slab_pages) { /* * shrink_slab() does not currently allow us to determine how - * many pages were freed in this zone. So we just shake the slab - * a bit and then go off node for this particular allocation - * despite possibly having freed enough memory to allocate in - * this zone. If we freed local memory then the next - * allocations will be local again. + * many pages were freed in this zone. So we take the current + * number of slab pages and shake the slab until it is reduced + * by the same nr_pages that we used for reclaiming unmapped + * pages. * - * shrink_slab will free memory on all zones and may take - * a long time. + * Note that shrink_slab will free memory on all zones and may + * take a long time. 
+ */ + while (shrink_slab(sc.nr_scanned, gfp_mask, order) && + zone_page_state(zone, NR_SLAB_RECLAIMABLE) > + slab_reclaimable - nr_pages) + ; + + /* + * Update nr_reclaimed by the number of slab pages we + * reclaimed from this zone. */ - shrink_slab(sc.nr_scanned, gfp_mask, order); + nr_reclaimed += slab_reclaimable - + zone_page_state(zone, NR_SLAB_RECLAIMABLE); } p->reclaim_state = NULL; @@ -1592,7 +1649,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) int node_id; /* - * Zone reclaim reclaims unmapped file backed pages. + * Zone reclaim reclaims unmapped file backed pages and + * slab pages if we are over the defined limits. * * A small portion of unmapped file backed pages is needed for * file I/O otherwise pages read by file I/O will be immediately @@ -1601,7 +1659,9 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * unmapped file backed pages. */ if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio) + zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages + && zone_page_state(zone, NR_SLAB_RECLAIMABLE) + <= zone->min_slab_pages) return 0; /* @@ -1621,7 +1681,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * over remote processors and spread off node memory allocations * as wide as possible. */ - node_id = zone->zone_pgdat->node_id; + node_id = zone_to_nid(zone); mask = node_to_cpumask(node_id); if (!cpus_empty(mask) && node_id != numa_node_id()) return 0; diff --git a/mm/vmstat.c b/mm/vmstat.c index c1b5f4106b38..a2b6a9f96e5c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -321,6 +321,9 @@ void refresh_cpu_vm_stats(int cpu) for_each_zone(zone) { struct per_cpu_pageset *pcp; + if (!populated_zone(zone)) + continue; + pcp = zone_pcp(zone, cpu); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) @@ -368,7 +371,7 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z) __inc_zone_state(z, NUMA_MISS); __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); } - if (z->zone_pgdat == NODE_DATA(numa_node_id())) + if (z->node == numa_node_id()) __inc_zone_state(z, NUMA_LOCAL); else __inc_zone_state(z, NUMA_OTHER); @@ -435,17 +438,34 @@ struct seq_operations fragmentation_op = { .show = frag_show, }; +#ifdef CONFIG_ZONE_DMA32 +#define TEXT_FOR_DMA32(xx) xx "_dma32", +#else +#define TEXT_FOR_DMA32(xx) +#endif + +#ifdef CONFIG_HIGHMEM +#define TEXT_FOR_HIGHMEM(xx) xx "_high", +#else +#define TEXT_FOR_HIGHMEM(xx) +#endif + +#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \ + TEXT_FOR_HIGHMEM(xx) + static char *vmstat_text[] = { /* Zoned VM counters */ "nr_anon_pages", "nr_mapped", "nr_file_pages", - "nr_slab", + "nr_slab_reclaimable", + "nr_slab_unreclaimable", "nr_page_table_pages", "nr_dirty", "nr_writeback", "nr_unstable", "nr_bounce", + "nr_vmscan_write", #ifdef CONFIG_NUMA "numa_hit", @@ -462,10 +482,7 @@ static char *vmstat_text[] = { "pswpin", "pswpout", - "pgalloc_dma", - "pgalloc_dma32", - "pgalloc_normal", - "pgalloc_high", + TEXTS_FOR_ZONES("pgalloc") "pgfree", "pgactivate", @@ -474,25 +491,10 @@ static char *vmstat_text[] = { "pgfault", "pgmajfault", - "pgrefill_dma", - "pgrefill_dma32", - "pgrefill_normal", - "pgrefill_high", - - "pgsteal_dma", - "pgsteal_dma32", - "pgsteal_normal", - "pgsteal_high", - - "pgscan_kswapd_dma", - "pgscan_kswapd_dma32", - "pgscan_kswapd_normal", - "pgscan_kswapd_high", - - "pgscan_direct_dma", - "pgscan_direct_dma32", - "pgscan_direct_normal", - "pgscan_direct_high", + 
TEXTS_FOR_ZONES("pgrefill") + TEXTS_FOR_ZONES("pgsteal") + TEXTS_FOR_ZONES("pgscan_kswapd") + TEXTS_FOR_ZONES("pgscan_direct") "pginodesteal", "slabs_scanned", |