summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorJiri Kosina <jkosina@suse.cz>2012-10-28 19:28:52 +0100
committerJiri Kosina <jkosina@suse.cz>2012-10-28 19:29:19 +0100
commit3bd7bf1f0fe14f591c089ae61bbfa9bd356f178a (patch)
tree0058693cc9e70b7461dae551f8a19aff2efd13ca /mm
parentf16f84937d769c893492160b1a8c3672e3992beb (diff)
parente657e078d3dfa9f96976db7a2b5fd7d7c9f1f1a6 (diff)
downloadlinux-stable-3bd7bf1f0fe14f591c089ae61bbfa9bd356f178a.tar.gz
linux-stable-3bd7bf1f0fe14f591c089ae61bbfa9bd356f178a.tar.bz2
linux-stable-3bd7bf1f0fe14f591c089ae61bbfa9bd356f178a.zip
Merge branch 'master' into for-next
Sync up with Linus' tree to be able to apply Cesar's patch against newer version of the code. Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig3
-rw-r--r--mm/Makefile4
-rw-r--r--mm/backing-dev.c50
-rw-r--r--mm/bootmem.c10
-rw-r--r--mm/compaction.c562
-rw-r--r--mm/fadvise.c34
-rw-r--r--mm/filemap.c6
-rw-r--r--mm/filemap_xip.c10
-rw-r--r--mm/fremap.c19
-rw-r--r--mm/frontswap.c34
-rw-r--r--mm/huge_memory.c441
-rw-r--r--mm/hugetlb.c34
-rw-r--r--mm/internal.h52
-rw-r--r--mm/interval_tree.c112
-rw-r--r--mm/kmemleak.c106
-rw-r--r--mm/ksm.c40
-rw-r--r--mm/madvise.c8
-rw-r--r--mm/memblock.c29
-rw-r--r--mm/memcontrol.c29
-rw-r--r--mm/memory-failure.c8
-rw-r--r--mm/memory.c115
-rw-r--r--mm/memory_hotplug.c77
-rw-r--r--mm/mempolicy.c153
-rw-r--r--mm/mlock.c27
-rw-r--r--mm/mmap.c210
-rw-r--r--mm/mmu_notifier.c89
-rw-r--r--mm/mremap.c73
-rw-r--r--mm/nobootmem.c5
-rw-r--r--mm/nommu.c39
-rw-r--r--mm/oom_kill.c4
-rw-r--r--mm/page-writeback.c14
-rw-r--r--mm/page_alloc.c319
-rw-r--r--mm/page_isolation.c43
-rw-r--r--mm/percpu.c2
-rw-r--r--mm/pgtable-generic.c50
-rw-r--r--mm/prio_tree.c208
-rw-r--r--mm/readahead.c14
-rw-r--r--mm/rmap.c179
-rw-r--r--mm/shmem.c180
-rw-r--r--mm/slab.c350
-rw-r--r--mm/slab.h19
-rw-r--r--mm/slab_common.c162
-rw-r--r--mm/slob.c97
-rw-r--r--mm/slub.c208
-rw-r--r--mm/swap.c13
-rw-r--r--mm/swapfile.c11
-rw-r--r--mm/truncate.c3
-rw-r--r--mm/util.c35
-rw-r--r--mm/vmalloc.c5
-rw-r--r--mm/vmscan.c111
-rw-r--r--mm/vmstat.c16
51 files changed, 2446 insertions, 1976 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d5c8019c6627..a3f8dddaaab3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -191,6 +191,7 @@ config SPLIT_PTLOCK_CPUS
# support for memory compaction
config COMPACTION
bool "Allow for memory compaction"
+ def_bool y
select MIGRATION
depends on MMU
help
@@ -318,7 +319,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
- depends on X86 && MMU
+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
select COMPACTION
help
Transparent Hugepages allows the kernel to use huge pages and
diff --git a/mm/Makefile b/mm/Makefile
index 92753e2d82da..6b025f80af34 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -14,9 +14,9 @@ endif
obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
- prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
+ util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
- compaction.o $(mmu-y)
+ compaction.o interval_tree.o $(mmu-y)
obj-y += init-mm.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index b41823cc05e6..d3ca2b3ee176 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -158,16 +158,16 @@ static ssize_t read_ahead_kb_store(struct device *dev,
const char *buf, size_t count)
{
struct backing_dev_info *bdi = dev_get_drvdata(dev);
- char *end;
unsigned long read_ahead_kb;
- ssize_t ret = -EINVAL;
+ ssize_t ret;
- read_ahead_kb = simple_strtoul(buf, &end, 10);
- if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
- bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
- ret = count;
- }
- return ret;
+ ret = kstrtoul(buf, 10, &read_ahead_kb);
+ if (ret < 0)
+ return ret;
+
+ bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
+
+ return count;
}
#define K(pages) ((pages) << (PAGE_SHIFT - 10))
@@ -187,16 +187,17 @@ static ssize_t min_ratio_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct backing_dev_info *bdi = dev_get_drvdata(dev);
- char *end;
unsigned int ratio;
- ssize_t ret = -EINVAL;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_min_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
- ratio = simple_strtoul(buf, &end, 10);
- if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
- ret = bdi_set_min_ratio(bdi, ratio);
- if (!ret)
- ret = count;
- }
return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio)
@@ -205,16 +206,17 @@ static ssize_t max_ratio_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
struct backing_dev_info *bdi = dev_get_drvdata(dev);
- char *end;
unsigned int ratio;
- ssize_t ret = -EINVAL;
+ ssize_t ret;
+
+ ret = kstrtouint(buf, 10, &ratio);
+ if (ret < 0)
+ return ret;
+
+ ret = bdi_set_max_ratio(bdi, ratio);
+ if (!ret)
+ ret = count;
- ratio = simple_strtoul(buf, &end, 10);
- if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) {
- ret = bdi_set_max_ratio(bdi, ratio);
- if (!ret)
- ret = count;
- }
return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio)
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..434be4ae7a04 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
int order = ilog2(BITS_PER_LONG);
__free_pages_bootmem(pfn_to_page(start), order);
+ fixup_zone_present_pages(page_to_nid(pfn_to_page(start)),
+ start, start + BITS_PER_LONG);
count += BITS_PER_LONG;
start += BITS_PER_LONG;
} else {
@@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
if (vec & 1) {
page = pfn_to_page(start + off);
__free_pages_bootmem(page, 0);
+ fixup_zone_present_pages(
+ page_to_nid(page),
+ start + off, start + off + 1);
count++;
}
vec >>= 1;
@@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
pages = bdata->node_low_pfn - bdata->node_min_pfn;
pages = bootmem_bootmap_pages(pages);
count += pages;
- while (pages--)
+ while (pages--) {
+ fixup_zone_present_pages(page_to_nid(page),
+ page_to_pfn(page), page_to_pfn(page) + 1);
__free_pages_bootmem(page++, 0);
+ }
bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
diff --git a/mm/compaction.c b/mm/compaction.c
index 7fcd3a52e68d..9eef55838fca 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,111 @@ static inline bool migrate_async_suitable(int migratetype)
return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}
+#ifdef CONFIG_COMPACTION
+/* Returns true if the pageblock should be scanned for pages to isolate. */
+static inline bool isolation_suitable(struct compact_control *cc,
+ struct page *page)
+{
+ if (cc->ignore_skip_hint)
+ return true;
+
+ return !get_pageblock_skip(page);
+}
+
+/*
+ * This function is called to clear all cached information on pageblocks that
+ * should be skipped for page isolation when the migrate and free page scanner
+ * meet.
+ */
+static void __reset_isolation_suitable(struct zone *zone)
+{
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ unsigned long pfn;
+
+ zone->compact_cached_migrate_pfn = start_pfn;
+ zone->compact_cached_free_pfn = end_pfn;
+ zone->compact_blockskip_flush = false;
+
+ /* Walk the zone and mark every pageblock as suitable for isolation */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ struct page *page;
+
+ cond_resched();
+
+ if (!pfn_valid(pfn))
+ continue;
+
+ page = pfn_to_page(pfn);
+ if (zone != page_zone(page))
+ continue;
+
+ clear_pageblock_skip(page);
+ }
+}
+
+void reset_isolation_suitable(pg_data_t *pgdat)
+{
+ int zoneid;
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ struct zone *zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ /* Only flush if a full compaction finished recently */
+ if (zone->compact_blockskip_flush)
+ __reset_isolation_suitable(zone);
+ }
+}
+
+/*
+ * If no pages were isolated then mark this pageblock to be skipped in the
+ * future. The information is later cleared by __reset_isolation_suitable().
+ */
+static void update_pageblock_skip(struct compact_control *cc,
+ struct page *page, unsigned long nr_isolated,
+ bool migrate_scanner)
+{
+ struct zone *zone = cc->zone;
+ if (!page)
+ return;
+
+ if (!nr_isolated) {
+ unsigned long pfn = page_to_pfn(page);
+ set_pageblock_skip(page);
+
+ /* Update where compaction should restart */
+ if (migrate_scanner) {
+ if (!cc->finished_update_migrate &&
+ pfn > zone->compact_cached_migrate_pfn)
+ zone->compact_cached_migrate_pfn = pfn;
+ } else {
+ if (!cc->finished_update_free &&
+ pfn < zone->compact_cached_free_pfn)
+ zone->compact_cached_free_pfn = pfn;
+ }
+ }
+}
+#else
+static inline bool isolation_suitable(struct compact_control *cc,
+ struct page *page)
+{
+ return true;
+}
+
+static void update_pageblock_skip(struct compact_control *cc,
+ struct page *page, unsigned long nr_isolated,
+ bool migrate_scanner)
+{
+}
+#endif /* CONFIG_COMPACTION */
+
+static inline bool should_release_lock(spinlock_t *lock)
+{
+ return need_resched() || spin_is_contended(lock);
+}
+
/*
* Compaction requires the taking of some coarse locks that are potentially
* very heavily contended. Check if the process needs to be scheduled or
@@ -62,7 +167,7 @@ static inline bool migrate_async_suitable(int migratetype)
static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
bool locked, struct compact_control *cc)
{
- if (need_resched() || spin_is_contended(lock)) {
+ if (should_release_lock(lock)) {
if (locked) {
spin_unlock_irqrestore(lock, *flags);
locked = false;
@@ -70,14 +175,11 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
/* async aborts if taking too long or contended */
if (!cc->sync) {
- if (cc->contended)
- *cc->contended = true;
+ cc->contended = true;
return false;
}
cond_resched();
- if (fatal_signal_pending(current))
- return false;
}
if (!locked)
@@ -91,44 +193,139 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
return compact_checklock_irqsave(lock, flags, false, cc);
}
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct page *page)
+{
+ int migratetype = get_pageblock_migratetype(page);
+
+ /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
+ if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
+ return false;
+
+ /* If the page is a large free page, then allow migration */
+ if (PageBuddy(page) && page_order(page) >= pageblock_order)
+ return true;
+
+ /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
+ if (migrate_async_suitable(migratetype))
+ return true;
+
+ /* Otherwise skip the block */
+ return false;
+}
+
+static void compact_capture_page(struct compact_control *cc)
+{
+ unsigned long flags;
+ int mtype, mtype_low, mtype_high;
+
+ if (!cc->page || *cc->page)
+ return;
+
+ /*
+ * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
+ * regardless of the migratetype of the freelist is is captured from.
+ * This is fine because the order for a high-order MIGRATE_MOVABLE
+ * allocation is typically at least a pageblock size and overall
+ * fragmentation is not impaired. Other allocation types must
+ * capture pages from their own migratelist because otherwise they
+ * could pollute other pageblocks like MIGRATE_MOVABLE with
+ * difficult to move pages and making fragmentation worse overall.
+ */
+ if (cc->migratetype == MIGRATE_MOVABLE) {
+ mtype_low = 0;
+ mtype_high = MIGRATE_PCPTYPES;
+ } else {
+ mtype_low = cc->migratetype;
+ mtype_high = cc->migratetype + 1;
+ }
+
+ /* Speculatively examine the free lists without zone lock */
+ for (mtype = mtype_low; mtype < mtype_high; mtype++) {
+ int order;
+ for (order = cc->order; order < MAX_ORDER; order++) {
+ struct page *page;
+ struct free_area *area;
+ area = &(cc->zone->free_area[order]);
+ if (list_empty(&area->free_list[mtype]))
+ continue;
+
+ /* Take the lock and attempt capture of the page */
+ if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
+ return;
+ if (!list_empty(&area->free_list[mtype])) {
+ page = list_entry(area->free_list[mtype].next,
+ struct page, lru);
+ if (capture_free_page(page, cc->order, mtype)) {
+ spin_unlock_irqrestore(&cc->zone->lock,
+ flags);
+ *cc->page = page;
+ return;
+ }
+ }
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+ }
+ }
+}
+
/*
* Isolate free pages onto a private freelist. Caller must hold zone->lock.
* If @strict is true, will abort returning 0 on any invalid PFNs or non-free
* pages inside of the pageblock (even though it may still end up isolating
* some pages).
*/
-static unsigned long isolate_freepages_block(unsigned long blockpfn,
+static unsigned long isolate_freepages_block(struct compact_control *cc,
+ unsigned long blockpfn,
unsigned long end_pfn,
struct list_head *freelist,
bool strict)
{
int nr_scanned = 0, total_isolated = 0;
- struct page *cursor;
+ struct page *cursor, *valid_page = NULL;
+ unsigned long nr_strict_required = end_pfn - blockpfn;
+ unsigned long flags;
+ bool locked = false;
cursor = pfn_to_page(blockpfn);
- /* Isolate free pages. This assumes the block is valid */
+ /* Isolate free pages. */
for (; blockpfn < end_pfn; blockpfn++, cursor++) {
int isolated, i;
struct page *page = cursor;
- if (!pfn_valid_within(blockpfn)) {
- if (strict)
- return 0;
- continue;
- }
nr_scanned++;
+ if (!pfn_valid_within(blockpfn))
+ continue;
+ if (!valid_page)
+ valid_page = page;
+ if (!PageBuddy(page))
+ continue;
- if (!PageBuddy(page)) {
- if (strict)
- return 0;
+ /*
+ * The zone lock must be held to isolate freepages.
+ * Unfortunately this is a very coarse lock and can be
+ * heavily contended if there are parallel allocations
+ * or parallel compactions. For async compaction do not
+ * spin on the lock and we acquire the lock as late as
+ * possible.
+ */
+ locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
+ locked, cc);
+ if (!locked)
+ break;
+
+ /* Recheck this is a suitable migration target under lock */
+ if (!strict && !suitable_migration_target(page))
+ break;
+
+ /* Recheck this is a buddy page under lock */
+ if (!PageBuddy(page))
continue;
- }
/* Found a free page, break it into order-0 pages */
isolated = split_free_page(page);
if (!isolated && strict)
- return 0;
+ break;
total_isolated += isolated;
for (i = 0; i < isolated; i++) {
list_add(&page->lru, freelist);
@@ -143,6 +340,22 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
}
trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
+
+ /*
+ * If strict isolation is requested by CMA then check that all the
+ * pages requested were isolated. If there were any failures, 0 is
+ * returned and CMA will fail.
+ */
+ if (strict && nr_strict_required > total_isolated)
+ total_isolated = 0;
+
+ if (locked)
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+
+ /* Update the pageblock-skip if the whole pageblock was scanned */
+ if (blockpfn == end_pfn)
+ update_pageblock_skip(cc, valid_page, total_isolated, false);
+
return total_isolated;
}
@@ -160,17 +373,14 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
* a free page).
*/
unsigned long
-isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
+isolate_freepages_range(struct compact_control *cc,
+ unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long isolated, pfn, block_end_pfn, flags;
- struct zone *zone = NULL;
+ unsigned long isolated, pfn, block_end_pfn;
LIST_HEAD(freelist);
- if (pfn_valid(start_pfn))
- zone = page_zone(pfn_to_page(start_pfn));
-
for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
- if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn)))
+ if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
break;
/*
@@ -180,10 +390,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
- spin_lock_irqsave(&zone->lock, flags);
- isolated = isolate_freepages_block(pfn, block_end_pfn,
+ isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
&freelist, true);
- spin_unlock_irqrestore(&zone->lock, flags);
/*
* In strict mode, isolate_freepages_block() returns 0 if
@@ -253,6 +461,7 @@ static bool too_many_isolated(struct zone *zone)
* @cc: Compaction control structure.
* @low_pfn: The first PFN of the range.
* @end_pfn: The one-past-the-last PFN of the range.
+ * @unevictable: true if it allows to isolate unevictable pages
*
* Isolate all pages that can be migrated from the range specified by
* [low_pfn, end_pfn). Returns zero if there is a fatal signal
@@ -268,7 +477,7 @@ static bool too_many_isolated(struct zone *zone)
*/
unsigned long
isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
- unsigned long low_pfn, unsigned long end_pfn)
+ unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
{
unsigned long last_pageblock_nr = 0, pageblock_nr;
unsigned long nr_scanned = 0, nr_isolated = 0;
@@ -276,7 +485,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
isolate_mode_t mode = 0;
struct lruvec *lruvec;
unsigned long flags;
- bool locked;
+ bool locked = false;
+ struct page *page = NULL, *valid_page = NULL;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -296,23 +506,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
/* Time to isolate some pages for migration */
cond_resched();
- spin_lock_irqsave(&zone->lru_lock, flags);
- locked = true;
for (; low_pfn < end_pfn; low_pfn++) {
- struct page *page;
-
/* give a chance to irqs before checking need_resched() */
- if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- locked = false;
+ if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+ if (should_release_lock(&zone->lru_lock)) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ locked = false;
+ }
}
- /* Check if it is ok to still hold the lock */
- locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
- locked, cc);
- if (!locked)
- break;
-
/*
* migrate_pfn does not necessarily start aligned to a
* pageblock. Ensure that pfn_valid is called when moving
@@ -340,6 +542,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (page_zone(page) != zone)
continue;
+ if (!valid_page)
+ valid_page = page;
+
+ /* If isolation recently failed, do not retry */
+ pageblock_nr = low_pfn >> pageblock_order;
+ if (!isolation_suitable(cc, page))
+ goto next_pageblock;
+
/* Skip if free */
if (PageBuddy(page))
continue;
@@ -349,24 +559,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
* migration is optimistic to see if the minimum amount of work
* satisfies the allocation
*/
- pageblock_nr = low_pfn >> pageblock_order;
if (!cc->sync && last_pageblock_nr != pageblock_nr &&
!migrate_async_suitable(get_pageblock_migratetype(page))) {
- low_pfn += pageblock_nr_pages;
- low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
- last_pageblock_nr = pageblock_nr;
- continue;
+ cc->finished_update_migrate = true;
+ goto next_pageblock;
}
+ /* Check may be lockless but that's ok as we recheck later */
if (!PageLRU(page))
continue;
/*
- * PageLRU is set, and lru_lock excludes isolation,
- * splitting and collapsing (collapsing has already
- * happened if PageLRU is set).
+ * PageLRU is set. lru_lock normally excludes isolation
+ * splitting and collapsing (collapsing has already happened
+ * if PageLRU is set) but the lock is not necessarily taken
+ * here and it is wasteful to take it just to check transhuge.
+ * Check TransHuge without lock and skip the whole pageblock if
+ * it's either a transhuge or hugetlbfs page, as calling
+ * compound_order() without preventing THP from splitting the
+ * page underneath us may return surprising results.
*/
if (PageTransHuge(page)) {
+ if (!locked)
+ goto next_pageblock;
+ low_pfn += (1 << compound_order(page)) - 1;
+ continue;
+ }
+
+ /* Check if it is ok to still hold the lock */
+ locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
+ locked, cc);
+ if (!locked || fatal_signal_pending(current))
+ break;
+
+ /* Recheck PageLRU and PageTransHuge under lock */
+ if (!PageLRU(page))
+ continue;
+ if (PageTransHuge(page)) {
low_pfn += (1 << compound_order(page)) - 1;
continue;
}
@@ -374,6 +603,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (!cc->sync)
mode |= ISOLATE_ASYNC_MIGRATE;
+ if (unevictable)
+ mode |= ISOLATE_UNEVICTABLE;
+
lruvec = mem_cgroup_page_lruvec(page, zone);
/* Try isolate the page */
@@ -383,6 +615,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
VM_BUG_ON(PageTransCompound(page));
/* Successfully isolated */
+ cc->finished_update_migrate = true;
del_page_from_lru_list(page, lruvec, page_lru(page));
list_add(&page->lru, migratelist);
cc->nr_migratepages++;
@@ -393,6 +626,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
++low_pfn;
break;
}
+
+ continue;
+
+next_pageblock:
+ low_pfn += pageblock_nr_pages;
+ low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+ last_pageblock_nr = pageblock_nr;
}
acct_isolated(zone, locked, cc);
@@ -400,6 +640,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (locked)
spin_unlock_irqrestore(&zone->lru_lock, flags);
+ /* Update the pageblock-skip if the whole pageblock was scanned */
+ if (low_pfn == end_pfn)
+ update_pageblock_skip(cc, valid_page, nr_isolated, true);
+
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
return low_pfn;
@@ -407,43 +651,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION
-
-/* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
-{
-
- int migratetype = get_pageblock_migratetype(page);
-
- /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
- if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
- return false;
-
- /* If the page is a large free page, then allow migration */
- if (PageBuddy(page) && page_order(page) >= pageblock_order)
- return true;
-
- /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
- if (migrate_async_suitable(migratetype))
- return true;
-
- /* Otherwise skip the block */
- return false;
-}
-
-/*
- * Returns the start pfn of the last page block in a zone. This is the starting
- * point for full compaction of a zone. Compaction searches for free pages from
- * the end of each zone, while isolate_freepages_block scans forward inside each
- * page block.
- */
-static unsigned long start_free_pfn(struct zone *zone)
-{
- unsigned long free_pfn;
- free_pfn = zone->zone_start_pfn + zone->spanned_pages;
- free_pfn &= ~(pageblock_nr_pages-1);
- return free_pfn;
-}
-
/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
@@ -453,7 +660,6 @@ static void isolate_freepages(struct zone *zone,
{
struct page *page;
unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
- unsigned long flags;
int nr_freepages = cc->nr_freepages;
struct list_head *freelist = &cc->freepages;
@@ -501,30 +707,16 @@ static void isolate_freepages(struct zone *zone,
if (!suitable_migration_target(page))
continue;
- /*
- * Found a block suitable for isolating free pages from. Now
- * we disabled interrupts, double check things are ok and
- * isolate the pages. This is to minimise the time IRQs
- * are disabled
- */
- isolated = 0;
+ /* If isolation recently failed, do not retry */
+ if (!isolation_suitable(cc, page))
+ continue;
- /*
- * The zone lock must be held to isolate freepages. This
- * unfortunately this is a very coarse lock and can be
- * heavily contended if there are parallel allocations
- * or parallel compactions. For async compaction do not
- * spin on the lock
- */
- if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
- break;
- if (suitable_migration_target(page)) {
- end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
- isolated = isolate_freepages_block(pfn, end_pfn,
- freelist, false);
- nr_freepages += isolated;
- }
- spin_unlock_irqrestore(&zone->lock, flags);
+ /* Found a block suitable for isolating free pages from */
+ isolated = 0;
+ end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
+ isolated = isolate_freepages_block(cc, pfn, end_pfn,
+ freelist, false);
+ nr_freepages += isolated;
/*
* Record the highest PFN we isolated pages from. When next
@@ -532,17 +724,8 @@ static void isolate_freepages(struct zone *zone,
* page migration may have returned some pages to the allocator
*/
if (isolated) {
+ cc->finished_update_free = true;
high_pfn = max(high_pfn, pfn);
-
- /*
- * If the free scanner has wrapped, update
- * compact_cached_free_pfn to point to the highest
- * pageblock with free pages. This reduces excessive
- * scanning of full pageblocks near the end of the
- * zone
- */
- if (cc->order > 0 && cc->wrapped)
- zone->compact_cached_free_pfn = high_pfn;
}
}
@@ -551,11 +734,6 @@ static void isolate_freepages(struct zone *zone,
cc->free_pfn = high_pfn;
cc->nr_freepages = nr_freepages;
-
- /* If compact_cached_free_pfn is reset then set it now */
- if (cc->order > 0 && !cc->wrapped &&
- zone->compact_cached_free_pfn == start_free_pfn(zone))
- zone->compact_cached_free_pfn = high_pfn;
}
/*
@@ -633,8 +811,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
}
/* Perform the isolation */
- low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
- if (!low_pfn)
+ low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false);
+ if (!low_pfn || cc->contended)
return ISOLATE_ABORT;
cc->migrate_pfn = low_pfn;
@@ -645,33 +823,24 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
static int compact_finished(struct zone *zone,
struct compact_control *cc)
{
- unsigned int order;
unsigned long watermark;
if (fatal_signal_pending(current))
return COMPACT_PARTIAL;
- /*
- * A full (order == -1) compaction run starts at the beginning and
- * end of a zone; it completes when the migrate and free scanner meet.
- * A partial (order > 0) compaction can start with the free scanner
- * at a random point in the zone, and may have to restart.
- */
+ /* Compaction run completes if the migrate and free scanner meet */
if (cc->free_pfn <= cc->migrate_pfn) {
- if (cc->order > 0 && !cc->wrapped) {
- /* We started partway through; restart at the end. */
- unsigned long free_pfn = start_free_pfn(zone);
- zone->compact_cached_free_pfn = free_pfn;
- cc->free_pfn = free_pfn;
- cc->wrapped = 1;
- return COMPACT_CONTINUE;
- }
- return COMPACT_COMPLETE;
- }
+ /*
+ * Mark that the PG_migrate_skip information should be cleared
+ * by kswapd when it goes to sleep. kswapd does not set the
+ * flag itself as the decision to be clear should be directly
+ * based on an allocation request.
+ */
+ if (!current_is_kswapd())
+ zone->compact_blockskip_flush = true;
- /* We wrapped around and ended up where we started. */
- if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
return COMPACT_COMPLETE;
+ }
/*
* order == -1 is expected when compacting via
@@ -688,14 +857,22 @@ static int compact_finished(struct zone *zone,
return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
- for (order = cc->order; order < MAX_ORDER; order++) {
- /* Job done if page is free of the right migratetype */
- if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
- return COMPACT_PARTIAL;
-
- /* Job done if allocation would set block type */
- if (order >= pageblock_order && zone->free_area[order].nr_free)
+ if (cc->page) {
+ /* Was a suitable page captured? */
+ if (*cc->page)
return COMPACT_PARTIAL;
+ } else {
+ unsigned int order;
+ for (order = cc->order; order < MAX_ORDER; order++) {
+ struct free_area *area = &zone->free_area[cc->order];
+ /* Job done if page is free of the right migratetype */
+ if (!list_empty(&area->free_list[cc->migratetype]))
+ return COMPACT_PARTIAL;
+
+ /* Job done if allocation would set block type */
+ if (cc->order >= pageblock_order && area->nr_free)
+ return COMPACT_PARTIAL;
+ }
}
return COMPACT_CONTINUE;
@@ -754,6 +931,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
int ret;
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
ret = compaction_suitable(zone, cc->order);
switch (ret) {
@@ -766,18 +945,30 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
;
}
- /* Setup to move all movable pages to the end of the zone */
- cc->migrate_pfn = zone->zone_start_pfn;
-
- if (cc->order > 0) {
- /* Incremental compaction. Start where the last one stopped. */
- cc->free_pfn = zone->compact_cached_free_pfn;
- cc->start_free_pfn = cc->free_pfn;
- } else {
- /* Order == -1 starts at the end of the zone. */
- cc->free_pfn = start_free_pfn(zone);
+ /*
+ * Setup to move all movable pages to the end of the zone. Used cached
+ * information on where the scanners should start but check that it
+ * is initialised by ensuring the values are within zone boundaries.
+ */
+ cc->migrate_pfn = zone->compact_cached_migrate_pfn;
+ cc->free_pfn = zone->compact_cached_free_pfn;
+ if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
+ cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+ zone->compact_cached_free_pfn = cc->free_pfn;
+ }
+ if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+ cc->migrate_pfn = start_pfn;
+ zone->compact_cached_migrate_pfn = cc->migrate_pfn;
}
+ /*
+ * Clear pageblock skip if there were failures recently and compaction
+ * is about to be retried after being deferred. kswapd does not do
+ * this reset as it'll reset the cached information when going to sleep.
+ */
+ if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ __reset_isolation_suitable(zone);
+
migrate_prep_local();
while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
@@ -787,6 +978,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
switch (isolate_migratepages(zone, cc)) {
case ISOLATE_ABORT:
ret = COMPACT_PARTIAL;
+ putback_lru_pages(&cc->migratepages);
+ cc->nr_migratepages = 0;
goto out;
case ISOLATE_NONE:
continue;
@@ -817,6 +1010,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
goto out;
}
}
+
+ /* Capture a page now if it is a suitable size */
+ compact_capture_page(cc);
}
out:
@@ -829,8 +1025,10 @@ out:
static unsigned long compact_zone_order(struct zone *zone,
int order, gfp_t gfp_mask,
- bool sync, bool *contended)
+ bool sync, bool *contended,
+ struct page **page)
{
+ unsigned long ret;
struct compact_control cc = {
.nr_freepages = 0,
.nr_migratepages = 0,
@@ -838,12 +1036,18 @@ static unsigned long compact_zone_order(struct zone *zone,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
.sync = sync,
- .contended = contended,
+ .page = page,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
- return compact_zone(zone, &cc);
+ ret = compact_zone(zone, &cc);
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+
+ *contended = cc.contended;
+ return ret;
}
int sysctl_extfrag_threshold = 500;
@@ -855,12 +1059,14 @@ int sysctl_extfrag_threshold = 500;
* @gfp_mask: The GFP mask of the current allocation
* @nodemask: The allowed nodes to allocate from
* @sync: Whether migration is synchronous or not
+ * @contended: Return value that is true if compaction was aborted due to lock contention
+ * @page: Optionally capture a free page of the requested order during compaction
*
* This is the main entry point for direct page compaction.
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *nodemask,
- bool sync, bool *contended)
+ bool sync, bool *contended, struct page **page)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
@@ -868,28 +1074,30 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
struct zoneref *z;
struct zone *zone;
int rc = COMPACT_SKIPPED;
+ int alloc_flags = 0;
- /*
- * Check whether it is worth even starting compaction. The order check is
- * made because an assumption is made that the page allocator can satisfy
- * the "cheaper" orders without taking special steps
- */
+ /* Check if the GFP flags allow compaction */
if (!order || !may_enter_fs || !may_perform_io)
return rc;
count_vm_event(COMPACTSTALL);
+#ifdef CONFIG_CMA
+ if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+#endif
/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
nodemask) {
int status;
status = compact_zone_order(zone, order, gfp_mask, sync,
- contended);
+ contended, page);
rc = max(status, rc);
/* If a normal allocation would succeed, stop compacting */
- if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
+ alloc_flags))
break;
}
@@ -940,6 +1148,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
struct compact_control cc = {
.order = order,
.sync = false,
+ .page = NULL,
};
return __compact_pgdat(pgdat, &cc);
@@ -950,6 +1159,7 @@ static int compact_node(int nid)
struct compact_control cc = {
.order = -1,
.sync = true,
+ .page = NULL,
};
return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 9b75a045dbf4..a47f0f50c89f 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -26,7 +26,7 @@
*/
SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
{
- struct file *file = fget(fd);
+ struct fd f = fdget(fd);
struct address_space *mapping;
struct backing_dev_info *bdi;
loff_t endbyte; /* inclusive */
@@ -35,15 +35,15 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
unsigned long nrpages;
int ret = 0;
- if (!file)
+ if (!f.file)
return -EBADF;
- if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
+ if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) {
ret = -ESPIPE;
goto out;
}
- mapping = file->f_mapping;
+ mapping = f.file->f_mapping;
if (!mapping || len < 0) {
ret = -EINVAL;
goto out;
@@ -76,21 +76,21 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
switch (advice) {
case POSIX_FADV_NORMAL:
- file->f_ra.ra_pages = bdi->ra_pages;
- spin_lock(&file->f_lock);
- file->f_mode &= ~FMODE_RANDOM;
- spin_unlock(&file->f_lock);
+ f.file->f_ra.ra_pages = bdi->ra_pages;
+ spin_lock(&f.file->f_lock);
+ f.file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&f.file->f_lock);
break;
case POSIX_FADV_RANDOM:
- spin_lock(&file->f_lock);
- file->f_mode |= FMODE_RANDOM;
- spin_unlock(&file->f_lock);
+ spin_lock(&f.file->f_lock);
+ f.file->f_mode |= FMODE_RANDOM;
+ spin_unlock(&f.file->f_lock);
break;
case POSIX_FADV_SEQUENTIAL:
- file->f_ra.ra_pages = bdi->ra_pages * 2;
- spin_lock(&file->f_lock);
- file->f_mode &= ~FMODE_RANDOM;
- spin_unlock(&file->f_lock);
+ f.file->f_ra.ra_pages = bdi->ra_pages * 2;
+ spin_lock(&f.file->f_lock);
+ f.file->f_mode &= ~FMODE_RANDOM;
+ spin_unlock(&f.file->f_lock);
break;
case POSIX_FADV_WILLNEED:
/* First and last PARTIAL page! */
@@ -106,7 +106,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
* Ignore return value because fadvise() shall return
* success even if filesystem can't retrieve a hint,
*/
- force_page_cache_readahead(mapping, file, start_index,
+ force_page_cache_readahead(mapping, f.file, start_index,
nrpages);
break;
case POSIX_FADV_NOREUSE:
@@ -128,7 +128,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
ret = -EINVAL;
}
out:
- fput(file);
+ fdput(f);
return ret;
}
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
diff --git a/mm/filemap.c b/mm/filemap.c
index 384344575c37..83efee76a5c0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
* Do we have something in the page cache already?
*/
page = find_get_page(mapping, offset);
- if (likely(page)) {
+ if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
/*
* We found the page, so try async readahead before
* waiting for the lock.
*/
do_async_mmap_readahead(vma, ra, file, page, offset);
- } else {
+ } else if (!page) {
/* No page in the page cache at all */
do_sync_mmap_readahead(vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
@@ -1737,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = filemap_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
/* This is used for a general mmap of a disk file */
@@ -1749,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
return -ENOEXEC;
file_accessed(file);
vma->vm_ops = &generic_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 13e013b1270c..a912da6ddfd4 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping,
{
struct vm_area_struct *vma;
struct mm_struct *mm;
- struct prio_tree_iter iter;
unsigned long address;
pte_t *pte;
pte_t pteval;
@@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
retry:
mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
mm = vma->vm_mm;
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -193,11 +192,13 @@ retry:
if (pte) {
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush_notify(vma, address, pte);
+ pteval = ptep_clear_flush(vma, address, pte);
page_remove_rmap(page);
dec_mm_counter(mm, MM_FILEPAGES);
BUG_ON(pte_dirty(pteval));
pte_unmap_unlock(pte, ptl);
+ /* must invalidate_page _before_ freeing the page */
+ mmu_notifier_invalidate_page(mm, address);
page_cache_release(page);
}
}
@@ -305,6 +306,7 @@ out:
static const struct vm_operations_struct xip_file_vm_ops = {
.fault = xip_file_fault,
.page_mkwrite = filemap_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
};
int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -313,7 +315,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
file_accessed(file);
vma->vm_ops = &xip_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP;
return 0;
}
EXPORT_SYMBOL_GPL(xip_file_mmap);
diff --git a/mm/fremap.c b/mm/fremap.c
index 9ed4fd432467..a0aaf0e56800 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -5,6 +5,7 @@
*
* started by Ingo Molnar, Copyright (C) 2002, 2003
*/
+#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/swap.h>
@@ -80,9 +81,10 @@ out:
return err;
}
-static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, unsigned long size, pgoff_t pgoff)
+int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long size, pgoff_t pgoff)
{
+ struct mm_struct *mm = vma->vm_mm;
int err;
do {
@@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
pgoff++;
} while (size);
- return 0;
-
+ return 0;
}
+EXPORT_SYMBOL(generic_file_remap_pages);
/**
* sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
@@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
goto out;
- if (!(vma->vm_flags & VM_CAN_NONLINEAR))
+ if (!vma->vm_ops || !vma->vm_ops->remap_pages)
goto out;
if (start < vma->vm_start || start + size > vma->vm_end)
@@ -195,10 +197,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
*/
if (mapping_cap_account_dirty(mapping)) {
unsigned long addr;
- struct file *file = vma->vm_file;
+ struct file *file = get_file(vma->vm_file);
flags &= MAP_NONBLOCK;
- get_file(file);
addr = mmap_region(file, start, size,
flags, vma->vm_flags, pgoff);
fput(file);
@@ -213,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
mutex_lock(&mapping->i_mmap_mutex);
flush_dcache_mmap_lock(mapping);
vma->vm_flags |= VM_NONLINEAR;
- vma_prio_tree_remove(vma, &mapping->i_mmap);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
flush_dcache_mmap_unlock(mapping);
mutex_unlock(&mapping->i_mmap_mutex);
@@ -229,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
}
mmu_notifier_invalidate_range_start(mm, start, start + size);
- err = populate_range(mm, vma, start, size, pgoff);
+ err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
mmu_notifier_invalidate_range_end(mm, start, start + size);
if (!err && !(flags & MAP_NONBLOCK)) {
if (vma->vm_flags & VM_LOCKED) {
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 6b3e71a2cd48..2890e67d6026 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -44,6 +44,13 @@ EXPORT_SYMBOL(frontswap_enabled);
*/
static bool frontswap_writethrough_enabled __read_mostly;
+/*
+ * If enabled, the underlying tmem implementation is capable of doing
+ * exclusive gets, so frontswap_load, on a successful tmem_get must
+ * mark the page as no longer in frontswap AND mark it dirty.
+ */
+static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;
+
#ifdef CONFIG_DEBUG_FS
/*
* Counters available via /sys/kernel/debug/frontswap (if debugfs is
@@ -97,6 +104,15 @@ void frontswap_writethrough(bool enable)
EXPORT_SYMBOL(frontswap_writethrough);
/*
+ * Enable/disable frontswap exclusive gets (see above).
+ */
+void frontswap_tmem_exclusive_gets(bool enable)
+{
+ frontswap_tmem_exclusive_gets_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
+
+/*
* Called when a swap device is swapon'd.
*/
void __frontswap_init(unsigned type)
@@ -174,8 +190,13 @@ int __frontswap_load(struct page *page)
BUG_ON(sis == NULL);
if (frontswap_test(sis, offset))
ret = frontswap_ops.load(type, offset, page);
- if (ret == 0)
+ if (ret == 0) {
inc_frontswap_loads();
+ if (frontswap_tmem_exclusive_gets_enabled) {
+ SetPageDirty(page);
+ frontswap_clear(sis, offset);
+ }
+ }
return ret;
}
EXPORT_SYMBOL(__frontswap_load);
@@ -263,6 +284,11 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
return ret;
}
+/*
+ * Used to check if it's necessory and feasible to unuse pages.
+ * Return 1 when nothing to do, 0 when need to shink pages,
+ * error code when there is an error.
+ */
static int __frontswap_shrink(unsigned long target_pages,
unsigned long *pages_to_unuse,
int *type)
@@ -275,7 +301,7 @@ static int __frontswap_shrink(unsigned long target_pages,
if (total_pages <= target_pages) {
/* Nothing to do */
*pages_to_unuse = 0;
- return 0;
+ return 1;
}
total_pages_to_unuse = total_pages - target_pages;
return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
@@ -292,7 +318,7 @@ static int __frontswap_shrink(unsigned long target_pages,
void frontswap_shrink(unsigned long target_pages)
{
unsigned long pages_to_unuse = 0;
- int type, ret;
+ int uninitialized_var(type), ret;
/*
* we don't want to hold swap_lock while doing a very
@@ -302,7 +328,7 @@ void frontswap_shrink(unsigned long target_pages)
spin_lock(&swap_lock);
ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
spin_unlock(&swap_lock);
- if (ret == 0 && pages_to_unuse)
+ if (ret == 0)
try_to_unuse(type, true, pages_to_unuse);
return;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 141dbb695097..40f17c34b415 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -17,6 +17,7 @@
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
+#include <linux/pagemap.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
@@ -102,10 +103,7 @@ static int set_recommended_min_free_kbytes(void)
unsigned long recommended_min;
extern int min_free_kbytes;
- if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
- &transparent_hugepage_flags) &&
- !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
- &transparent_hugepage_flags))
+ if (!khugepaged_enabled())
return 0;
for_each_populated_zone(zone)
@@ -139,12 +137,6 @@ static int start_khugepaged(void)
{
int err = 0;
if (khugepaged_enabled()) {
- int wakeup;
- if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
- err = -ENOMEM;
- goto out;
- }
- mutex_lock(&khugepaged_mutex);
if (!khugepaged_thread)
khugepaged_thread = kthread_run(khugepaged, NULL,
"khugepaged");
@@ -154,16 +146,16 @@ static int start_khugepaged(void)
err = PTR_ERR(khugepaged_thread);
khugepaged_thread = NULL;
}
- wakeup = !list_empty(&khugepaged_scan.mm_head);
- mutex_unlock(&khugepaged_mutex);
- if (wakeup)
+
+ if (!list_empty(&khugepaged_scan.mm_head))
wake_up_interruptible(&khugepaged_wait);
set_recommended_min_free_kbytes();
- } else
- /* wakeup to exit */
- wake_up_interruptible(&khugepaged_wait);
-out:
+ } else if (khugepaged_thread) {
+ kthread_stop(khugepaged_thread);
+ khugepaged_thread = NULL;
+ }
+
return err;
}
@@ -224,18 +216,16 @@ static ssize_t enabled_store(struct kobject *kobj,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
if (ret > 0) {
- int err = start_khugepaged();
+ int err;
+
+ mutex_lock(&khugepaged_mutex);
+ err = start_khugepaged();
+ mutex_unlock(&khugepaged_mutex);
+
if (err)
ret = err;
}
- if (ret > 0 &&
- (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
- &transparent_hugepage_flags) ||
- test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
- &transparent_hugepage_flags)))
- set_recommended_min_free_kbytes();
-
return ret;
}
static struct kobj_attribute enabled_attr =
@@ -570,8 +560,6 @@ static int __init hugepage_init(void)
start_khugepaged();
- set_recommended_min_free_kbytes();
-
return 0;
out:
hugepage_exit_sysfs(hugepage_kobj);
@@ -611,19 +599,6 @@ out:
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
-static void prepare_pmd_huge_pte(pgtable_t pgtable,
- struct mm_struct *mm)
-{
- assert_spin_locked(&mm->page_table_lock);
-
- /* FIFO */
- if (!mm->pmd_huge_pte)
- INIT_LIST_HEAD(&pgtable->lru);
- else
- list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
- mm->pmd_huge_pte = pgtable;
-}
-
static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
if (likely(vma->vm_flags & VM_WRITE))
@@ -665,7 +640,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
*/
page_add_new_anon_rmap(page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
- prepare_pmd_huge_pte(pgtable, mm);
+ pgtable_trans_huge_deposit(mm, pgtable);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm->nr_ptes++;
spin_unlock(&mm->page_table_lock);
@@ -791,7 +766,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmdp_set_wrprotect(src_mm, addr, src_pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
- prepare_pmd_huge_pte(pgtable, dst_mm);
+ pgtable_trans_huge_deposit(dst_mm, pgtable);
dst_mm->nr_ptes++;
ret = 0;
@@ -802,25 +777,6 @@ out:
return ret;
}
-/* no "address" argument so destroys page coloring of some arch */
-pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
-{
- pgtable_t pgtable;
-
- assert_spin_locked(&mm->page_table_lock);
-
- /* FIFO */
- pgtable = mm->pmd_huge_pte;
- if (list_empty(&pgtable->lru))
- mm->pmd_huge_pte = NULL;
- else {
- mm->pmd_huge_pte = list_entry(pgtable->lru.next,
- struct page, lru);
- list_del(&pgtable->lru);
- }
- return pgtable;
-}
-
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
@@ -832,6 +788,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
pmd_t _pmd;
int ret = 0, i;
struct page **pages;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
GFP_KERNEL);
@@ -868,15 +826,19 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
cond_resched();
}
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON(!PageHead(page));
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ pmdp_clear_flush(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
- pgtable = get_pmd_huge_pte(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm);
pmd_populate(mm, &_pmd, pgtable);
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -896,6 +858,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
page_remove_rmap(page);
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
ret |= VM_FAULT_WRITE;
put_page(page);
@@ -904,6 +868,7 @@ out:
out_free_pages:
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mem_cgroup_uncharge_start();
for (i = 0; i < HPAGE_PMD_NR; i++) {
mem_cgroup_uncharge_page(pages[i]);
@@ -920,6 +885,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
int ret = 0;
struct page *page, *new_page;
unsigned long haddr;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
VM_BUG_ON(!vma->anon_vma);
spin_lock(&mm->page_table_lock);
@@ -934,7 +901,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
- update_mmu_cache(vma, address, entry);
+ update_mmu_cache_pmd(vma, address, pmd);
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
@@ -970,38 +937,47 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
+ mmun_start = haddr;
+ mmun_end = haddr + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
spin_lock(&mm->page_table_lock);
put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
spin_unlock(&mm->page_table_lock);
mem_cgroup_uncharge_page(new_page);
put_page(new_page);
- goto out;
+ goto out_mn;
} else {
pmd_t entry;
VM_BUG_ON(!PageHead(page));
entry = mk_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
entry = pmd_mkhuge(entry);
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ pmdp_clear_flush(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
set_pmd_at(mm, haddr, pmd, entry);
- update_mmu_cache(vma, address, entry);
+ update_mmu_cache_pmd(vma, address, pmd);
page_remove_rmap(page);
put_page(page);
ret |= VM_FAULT_WRITE;
}
-out_unlock:
spin_unlock(&mm->page_table_lock);
+out_mn:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return ret;
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
}
-struct page *follow_trans_huge_pmd(struct mm_struct *mm,
+struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned long addr,
pmd_t *pmd,
unsigned int flags)
{
+ struct mm_struct *mm = vma->vm_mm;
struct page *page = NULL;
assert_spin_locked(&mm->page_table_lock);
@@ -1024,6 +1000,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
}
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ if (page->mapping && trylock_page(page)) {
+ lru_add_drain();
+ if (page->mapping)
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
+ }
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON(!PageCompound(page));
if (flags & FOLL_GET)
@@ -1041,9 +1025,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (__pmd_trans_huge_lock(pmd, vma) == 1) {
struct page *page;
pgtable_t pgtable;
- pgtable = get_pmd_huge_pte(tlb->mm);
- page = pmd_page(*pmd);
- pmd_clear(pmd);
+ pmd_t orig_pmd;
+ pgtable = pgtable_trans_huge_withdraw(tlb->mm);
+ orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
+ page = pmd_page(orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
page_remove_rmap(page);
VM_BUG_ON(page_mapcount(page) < 0);
@@ -1207,7 +1192,11 @@ static int __split_huge_page_splitting(struct page *page,
struct mm_struct *mm = vma->vm_mm;
pmd_t *pmd;
int ret = 0;
+ /* For mmu_notifiers */
+ const unsigned long mmun_start = address;
+ const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
@@ -1219,10 +1208,11 @@ static int __split_huge_page_splitting(struct page *page,
* and it won't wait on the anon_vma->root->mutex to
* serialize against split_huge_page*.
*/
- pmdp_splitting_flush_notify(vma, address, pmd);
+ pmdp_splitting_flush(vma, address, pmd);
ret = 1;
}
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return ret;
}
@@ -1358,11 +1348,11 @@ static int __split_huge_page_map(struct page *page,
pmd = page_check_address_pmd(page, mm, address,
PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
if (pmd) {
- pgtable = get_pmd_huge_pte(mm);
+ pgtable = pgtable_trans_huge_withdraw(mm);
pmd_populate(mm, &_pmd, pgtable);
- for (i = 0, haddr = address; i < HPAGE_PMD_NR;
- i++, haddr += PAGE_SIZE) {
+ haddr = address;
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
pte_t *pte, entry;
BUG_ON(PageCompound(page+i));
entry = mk_pte(page + i, vma->vm_page_prot);
@@ -1406,8 +1396,7 @@ static int __split_huge_page_map(struct page *page,
* SMP TLB and finally we write the non-huge version
* of the pmd entry with pmd_populate.
*/
- set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ pmdp_invalidate(vma, address, pmd);
pmd_populate(mm, pmd, pgtable);
ret = 1;
}
@@ -1421,18 +1410,17 @@ static void __split_huge_page(struct page *page,
struct anon_vma *anon_vma)
{
int mapcount, mapcount2;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct anon_vma_chain *avc;
BUG_ON(!PageHead(page));
BUG_ON(PageTail(page));
mapcount = 0;
- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long addr = vma_address(page, vma);
BUG_ON(is_vma_temporary_stack(vma));
- if (addr == -EFAULT)
- continue;
mapcount += __split_huge_page_splitting(page, vma, addr);
}
/*
@@ -1453,12 +1441,10 @@ static void __split_huge_page(struct page *page,
__split_huge_page_refcount(page);
mapcount2 = 0;
- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long addr = vma_address(page, vma);
BUG_ON(is_vma_temporary_stack(vma));
- if (addr == -EFAULT)
- continue;
mapcount2 += __split_huge_page_map(page, vma, addr);
}
if (mapcount != mapcount2)
@@ -1491,12 +1477,13 @@ out:
return ret;
}
-#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
- VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
+#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
+ struct mm_struct *mm = vma->vm_mm;
+
switch (advice) {
case MADV_HUGEPAGE:
/*
@@ -1504,6 +1491,8 @@ int hugepage_madvise(struct vm_area_struct *vma,
*/
if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
return -EINVAL;
+ if (mm->def_flags & VM_NOHUGEPAGE)
+ return -EINVAL;
*vm_flags &= ~VM_NOHUGEPAGE;
*vm_flags |= VM_HUGEPAGE;
/*
@@ -1655,11 +1644,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
if (vma->vm_ops)
/* khugepaged not yet working on file or special mappings */
return 0;
- /*
- * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
- * true too, verify it here.
- */
- VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
+ VM_BUG_ON(vma->vm_flags & VM_NO_THP);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
@@ -1833,28 +1818,35 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
}
}
-static void collapse_huge_page(struct mm_struct *mm,
- unsigned long address,
- struct page **hpage,
- struct vm_area_struct *vma,
- int node)
+static void khugepaged_alloc_sleep(void)
{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd, _pmd;
- pte_t *pte;
- pgtable_t pgtable;
- struct page *new_page;
- spinlock_t *ptl;
- int isolated;
- unsigned long hstart, hend;
+ wait_event_freezable_timeout(khugepaged_wait, false,
+ msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+}
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-#ifndef CONFIG_NUMA
- up_read(&mm->mmap_sem);
- VM_BUG_ON(!*hpage);
- new_page = *hpage;
-#else
+#ifdef CONFIG_NUMA
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+ if (IS_ERR(*hpage)) {
+ if (!*wait)
+ return false;
+
+ *wait = false;
+ *hpage = NULL;
+ khugepaged_alloc_sleep();
+ } else if (*hpage) {
+ put_page(*hpage);
+ *hpage = NULL;
+ }
+
+ return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ int node)
+{
VM_BUG_ON(*hpage);
/*
* Allocate the page while the vma is still valid and under
@@ -1866,7 +1858,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* mmap_sem in read mode is good idea also to allow greater
* scalability.
*/
- new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+ *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
node, __GFP_OTHER_NODE);
/*
@@ -1874,20 +1866,85 @@ static void collapse_huge_page(struct mm_struct *mm,
* preparation for taking it in write mode.
*/
up_read(&mm->mmap_sem);
- if (unlikely(!new_page)) {
+ if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
*hpage = ERR_PTR(-ENOMEM);
- return;
+ return NULL;
}
-#endif
count_vm_event(THP_COLLAPSE_ALLOC);
- if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
-#ifdef CONFIG_NUMA
- put_page(new_page);
+ return *hpage;
+}
+#else
+static struct page *khugepaged_alloc_hugepage(bool *wait)
+{
+ struct page *hpage;
+
+ do {
+ hpage = alloc_hugepage(khugepaged_defrag());
+ if (!hpage) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ if (!*wait)
+ return NULL;
+
+ *wait = false;
+ khugepaged_alloc_sleep();
+ } else
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ } while (unlikely(!hpage) && likely(khugepaged_enabled()));
+
+ return hpage;
+}
+
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+ if (!*hpage)
+ *hpage = khugepaged_alloc_hugepage(wait);
+
+ if (unlikely(!*hpage))
+ return false;
+
+ return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ int node)
+{
+ up_read(&mm->mmap_sem);
+ VM_BUG_ON(!*hpage);
+ return *hpage;
+}
#endif
+
+static void collapse_huge_page(struct mm_struct *mm,
+ unsigned long address,
+ struct page **hpage,
+ struct vm_area_struct *vma,
+ int node)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+ pgtable_t pgtable;
+ struct page *new_page;
+ spinlock_t *ptl;
+ int isolated;
+ unsigned long hstart, hend;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ /* release the mmap_sem read lock. */
+ new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+ if (!new_page)
+ return;
+
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
return;
- }
/*
* Prevent all access to pagetables with the exception of
@@ -1912,11 +1969,7 @@ static void collapse_huge_page(struct mm_struct *mm,
goto out;
if (is_vma_temporary_stack(vma))
goto out;
- /*
- * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
- * true too, verify it here.
- */
- VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
+ VM_BUG_ON(vma->vm_flags & VM_NO_THP);
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
@@ -1936,6 +1989,9 @@ static void collapse_huge_page(struct mm_struct *mm,
pte = pte_offset_map(pmd, address);
ptl = pte_lockptr(mm, pmd);
+ mmun_start = address;
+ mmun_end = address + HPAGE_PMD_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
spin_lock(&mm->page_table_lock); /* probably unnecessary */
/*
* After this gup_fast can't run anymore. This also removes
@@ -1943,8 +1999,9 @@ static void collapse_huge_page(struct mm_struct *mm,
* huge and small TLB entries for the same virtual address
* to avoid the risk of CPU bugs in that area.
*/
- _pmd = pmdp_clear_flush_notify(vma, address, pmd);
+ _pmd = pmdp_clear_flush(vma, address, pmd);
spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
spin_lock(ptl);
isolated = __collapse_huge_page_isolate(vma, address, pte);
@@ -1970,8 +2027,6 @@ static void collapse_huge_page(struct mm_struct *mm,
pte_unmap(pte);
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
- VM_BUG_ON(page_count(pgtable) != 1);
- VM_BUG_ON(page_mapcount(pgtable) != 0);
_pmd = mk_pmd(new_page, vma->vm_page_prot);
_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
@@ -1988,13 +2043,12 @@ static void collapse_huge_page(struct mm_struct *mm,
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address);
set_pmd_at(mm, address, pmd, _pmd);
- update_mmu_cache(vma, address, _pmd);
- prepare_pmd_huge_pte(pgtable, mm);
+ update_mmu_cache_pmd(vma, address, pmd);
+ pgtable_trans_huge_deposit(mm, pgtable);
spin_unlock(&mm->page_table_lock);
-#ifndef CONFIG_NUMA
*hpage = NULL;
-#endif
+
khugepaged_pages_collapsed++;
out_up_write:
up_write(&mm->mmap_sem);
@@ -2002,9 +2056,6 @@ out_up_write:
out:
mem_cgroup_uncharge_page(new_page);
-#ifdef CONFIG_NUMA
- put_page(new_page);
-#endif
goto out_up_write;
}
@@ -2154,12 +2205,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
goto skip;
if (is_vma_temporary_stack(vma))
goto skip;
- /*
- * If is_pfn_mapping() is true is_learn_pfn_mapping()
- * must be true too, verify it here.
- */
- VM_BUG_ON(is_linear_pfn_mapping(vma) ||
- vma->vm_flags & VM_NO_THP);
+ VM_BUG_ON(vma->vm_flags & VM_NO_THP);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
@@ -2234,32 +2280,23 @@ static int khugepaged_has_work(void)
static int khugepaged_wait_event(void)
{
return !list_empty(&khugepaged_scan.mm_head) ||
- !khugepaged_enabled();
+ kthread_should_stop();
}
-static void khugepaged_do_scan(struct page **hpage)
+static void khugepaged_do_scan(void)
{
+ struct page *hpage = NULL;
unsigned int progress = 0, pass_through_head = 0;
unsigned int pages = khugepaged_pages_to_scan;
+ bool wait = true;
barrier(); /* write khugepaged_pages_to_scan to local stack */
while (progress < pages) {
- cond_resched();
-
-#ifndef CONFIG_NUMA
- if (!*hpage) {
- *hpage = alloc_hugepage(khugepaged_defrag());
- if (unlikely(!*hpage)) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- break;
- }
- count_vm_event(THP_COLLAPSE_ALLOC);
- }
-#else
- if (IS_ERR(*hpage))
+ if (!khugepaged_prealloc_page(&hpage, &wait))
break;
-#endif
+
+ cond_resched();
if (unlikely(kthread_should_stop() || freezing(current)))
break;
@@ -2270,73 +2307,32 @@ static void khugepaged_do_scan(struct page **hpage)
if (khugepaged_has_work() &&
pass_through_head < 2)
progress += khugepaged_scan_mm_slot(pages - progress,
- hpage);
+ &hpage);
else
progress = pages;
spin_unlock(&khugepaged_mm_lock);
}
-}
-static void khugepaged_alloc_sleep(void)
-{
- wait_event_freezable_timeout(khugepaged_wait, false,
- msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+ if (!IS_ERR_OR_NULL(hpage))
+ put_page(hpage);
}
-#ifndef CONFIG_NUMA
-static struct page *khugepaged_alloc_hugepage(void)
+static void khugepaged_wait_work(void)
{
- struct page *hpage;
-
- do {
- hpage = alloc_hugepage(khugepaged_defrag());
- if (!hpage) {
- count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
- khugepaged_alloc_sleep();
- } else
- count_vm_event(THP_COLLAPSE_ALLOC);
- } while (unlikely(!hpage) &&
- likely(khugepaged_enabled()));
- return hpage;
-}
-#endif
+ try_to_freeze();
-static void khugepaged_loop(void)
-{
- struct page *hpage;
+ if (khugepaged_has_work()) {
+ if (!khugepaged_scan_sleep_millisecs)
+ return;
-#ifdef CONFIG_NUMA
- hpage = NULL;
-#endif
- while (likely(khugepaged_enabled())) {
-#ifndef CONFIG_NUMA
- hpage = khugepaged_alloc_hugepage();
- if (unlikely(!hpage))
- break;
-#else
- if (IS_ERR(hpage)) {
- khugepaged_alloc_sleep();
- hpage = NULL;
- }
-#endif
-
- khugepaged_do_scan(&hpage);
-#ifndef CONFIG_NUMA
- if (hpage)
- put_page(hpage);
-#endif
- try_to_freeze();
- if (unlikely(kthread_should_stop()))
- break;
- if (khugepaged_has_work()) {
- if (!khugepaged_scan_sleep_millisecs)
- continue;
- wait_event_freezable_timeout(khugepaged_wait, false,
- msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
- } else if (khugepaged_enabled())
- wait_event_freezable(khugepaged_wait,
- khugepaged_wait_event());
+ wait_event_freezable_timeout(khugepaged_wait,
+ kthread_should_stop(),
+ msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
+ return;
}
+
+ if (khugepaged_enabled())
+ wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}
static int khugepaged(void *none)
@@ -2346,20 +2342,9 @@ static int khugepaged(void *none)
set_freezable();
set_user_nice(current, 19);
- /* serialize with start_khugepaged() */
- mutex_lock(&khugepaged_mutex);
-
- for (;;) {
- mutex_unlock(&khugepaged_mutex);
- VM_BUG_ON(khugepaged_thread != current);
- khugepaged_loop();
- VM_BUG_ON(khugepaged_thread != current);
-
- mutex_lock(&khugepaged_mutex);
- if (!khugepaged_enabled())
- break;
- if (unlikely(kthread_should_stop()))
- break;
+ while (!kthread_should_stop()) {
+ khugepaged_do_scan();
+ khugepaged_wait_work();
}
spin_lock(&khugepaged_mm_lock);
@@ -2368,10 +2353,6 @@ static int khugepaged(void *none)
if (mm_slot)
collect_mm_slot(mm_slot);
spin_unlock(&khugepaged_mm_lock);
-
- khugepaged_thread = NULL;
- mutex_unlock(&khugepaged_mutex);
-
return 0;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc727122dd44..59a0059b39e2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,7 +30,6 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
-#include <linux/hugetlb_cgroup.h>
#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -637,6 +636,7 @@ static void free_huge_page(struct page *page)
h->surplus_huge_pages--;
h->surplus_huge_pages_node[nid]--;
} else {
+ arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
@@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
}
}
+/*
+ * PageHuge() only returns true for hugetlbfs pages, but not for normal or
+ * transparent huge pages. See the PageTransHuge() documentation for more
+ * details.
+ */
int PageHuge(struct page *page)
{
compound_page_dtor *dtor;
@@ -2355,13 +2360,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ const unsigned long mmun_start = start; /* For mmu_notifiers */
+ const unsigned long mmun_end = end; /* For mmu_notifiers */
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
BUG_ON(end & ~huge_page_mask(h));
tlb_start_vma(tlb, vma);
- mmu_notifier_invalidate_range_start(mm, start, end);
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
again:
spin_lock(&mm->page_table_lock);
for (address = start; address < end; address += sz) {
@@ -2425,7 +2432,7 @@ again:
if (address < end && !ref_page)
goto again;
}
- mmu_notifier_invalidate_range_end(mm, start, end);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
tlb_end_vma(tlb, vma);
}
@@ -2473,7 +2480,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
struct hstate *h = hstate_vma(vma);
struct vm_area_struct *iter_vma;
struct address_space *mapping;
- struct prio_tree_iter iter;
pgoff_t pgoff;
/*
@@ -2481,7 +2487,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* from page cache lookup which is in HPAGE_SIZE units.
*/
address = address & huge_page_mask(h);
- pgoff = vma_hugecache_offset(h, vma, address);
+ pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
/*
@@ -2490,7 +2497,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* __unmap_hugepage_range() is called as the lock is already held
*/
mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
/* Do not unmap the current VMA */
if (iter_vma == vma)
continue;
@@ -2525,6 +2532,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *old_page, *new_page;
int avoidcopy;
int outside_reserve = 0;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
old_page = pte_page(pte);
@@ -2611,6 +2620,9 @@ retry_avoidcopy:
pages_per_huge_page(h));
__SetPageUptodate(new_page);
+ mmun_start = address & huge_page_mask(h);
+ mmun_end = mmun_start + huge_page_size(h);
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
* Retake the page_table_lock to check for racing updates
* before the page tables are altered
@@ -2619,9 +2631,6 @@ retry_avoidcopy:
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
if (likely(pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW */
- mmu_notifier_invalidate_range_start(mm,
- address & huge_page_mask(h),
- (address & huge_page_mask(h)) + huge_page_size(h));
huge_ptep_clear_flush(vma, address, ptep);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
@@ -2629,10 +2638,11 @@ retry_avoidcopy:
hugepage_add_new_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
- mmu_notifier_invalidate_range_end(mm,
- address & huge_page_mask(h),
- (address & huge_page_mask(h)) + huge_page_size(h));
}
+ spin_unlock(&mm->page_table_lock);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ /* Caller expects lock to be held */
+ spin_lock(&mm->page_table_lock);
page_cache_release(new_page);
page_cache_release(old_page);
return 0;
diff --git a/mm/internal.h b/mm/internal.h
index b8c91b342e24..a4fa284f6bc2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -118,26 +118,27 @@ struct compact_control {
unsigned long nr_freepages; /* Number of isolated free pages */
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
- unsigned long start_free_pfn; /* where we started the search */
unsigned long migrate_pfn; /* isolate_migratepages search base */
bool sync; /* Synchronous migration */
- bool wrapped; /* Order > 0 compactions are
- incremental, once free_pfn
- and migrate_pfn meet, we restart
- from the top of the zone;
- remember we wrapped around. */
+ bool ignore_skip_hint; /* Scan blocks even if marked skip */
+ bool finished_update_free; /* True when the zone cached pfns are
+ * no longer being updated
+ */
+ bool finished_update_migrate;
int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
- bool *contended; /* True if a lock was contended */
+ bool contended; /* True if a lock was contended */
+ struct page **page; /* Page captured of requested size */
};
unsigned long
-isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn);
+isolate_freepages_range(struct compact_control *cc,
+ unsigned long start_pfn, unsigned long end_pfn);
unsigned long
isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
- unsigned long low_pfn, unsigned long end_pfn);
+ unsigned long low_pfn, unsigned long end_pfn, bool unevictable);
#endif
@@ -167,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
}
/*
- * Called only in fault path via page_evictable() for a new page
- * to determine if it's being mapped into a LOCKED vma.
- * If so, mark page as mlocked.
+ * Called only in fault path, to determine if a new page is being
+ * mapped into a LOCKED vma. If it is, mark page as mlocked.
*/
static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
struct page *page)
@@ -180,7 +180,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
return 0;
if (!TestSetPageMlocked(page)) {
- inc_zone_page_state(page, NR_MLOCK);
+ mod_zone_page_state(page_zone(page), NR_MLOCK,
+ hpage_nr_pages(page));
count_vm_event(UNEVICTABLE_PGMLOCKED);
}
return 1;
@@ -201,12 +202,7 @@ extern void munlock_vma_page(struct page *page);
* If called for a page that is still mapped by mlocked vmas, all we do
* is revert to lazy LRU behaviour -- semantics are not broken.
*/
-extern void __clear_page_mlock(struct page *page);
-static inline void clear_page_mlock(struct page *page)
-{
- if (unlikely(TestClearPageMlocked(page)))
- __clear_page_mlock(page);
-}
+extern void clear_page_mlock(struct page *page);
/*
* mlock_migrate_page - called only from migrate_page_copy() to
@@ -340,7 +336,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
#define ZONE_RECLAIM_FULL -1
#define ZONE_RECLAIM_SOME 0
#define ZONE_RECLAIM_SUCCESS 1
-#endif
extern int hwpoison_filter(struct page *p);
@@ -356,3 +351,20 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long);
extern void set_pageblock_order(void);
+unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+ struct list_head *page_list);
+/* The ALLOC_WMARK bits are used as an index to zone->watermark */
+#define ALLOC_WMARK_MIN WMARK_MIN
+#define ALLOC_WMARK_LOW WMARK_LOW
+#define ALLOC_WMARK_HIGH WMARK_HIGH
+#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
+
+/* Mask to get the watermark bits */
+#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
+
+#define ALLOC_HARDER 0x10 /* try to alloc harder */
+#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
+
+#endif /* __MM_INTERNAL_H */
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
new file mode 100644
index 000000000000..4a5822a586e6
--- /dev/null
+++ b/mm/interval_tree.c
@@ -0,0 +1,112 @@
+/*
+ * mm/interval_tree.c - interval tree for mapping->i_mmap
+ *
+ * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
+ *
+ * This file is released under the GPL v2.
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/rmap.h>
+#include <linux/interval_tree_generic.h>
+
+static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
+{
+ return v->vm_pgoff;
+}
+
+static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
+{
+ return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
+ unsigned long, shared.linear.rb_subtree_last,
+ vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
+
+/* Insert node immediately after prev in the interval tree */
+void vma_interval_tree_insert_after(struct vm_area_struct *node,
+ struct vm_area_struct *prev,
+ struct rb_root *root)
+{
+ struct rb_node **link;
+ struct vm_area_struct *parent;
+ unsigned long last = vma_last_pgoff(node);
+
+ VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev));
+
+ if (!prev->shared.linear.rb.rb_right) {
+ parent = prev;
+ link = &prev->shared.linear.rb.rb_right;
+ } else {
+ parent = rb_entry(prev->shared.linear.rb.rb_right,
+ struct vm_area_struct, shared.linear.rb);
+ if (parent->shared.linear.rb_subtree_last < last)
+ parent->shared.linear.rb_subtree_last = last;
+ while (parent->shared.linear.rb.rb_left) {
+ parent = rb_entry(parent->shared.linear.rb.rb_left,
+ struct vm_area_struct, shared.linear.rb);
+ if (parent->shared.linear.rb_subtree_last < last)
+ parent->shared.linear.rb_subtree_last = last;
+ }
+ link = &parent->shared.linear.rb.rb_left;
+ }
+
+ node->shared.linear.rb_subtree_last = last;
+ rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
+ rb_insert_augmented(&node->shared.linear.rb, root,
+ &vma_interval_tree_augment);
+}
+
+static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
+{
+ return vma_start_pgoff(avc->vma);
+}
+
+static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
+{
+ return vma_last_pgoff(avc->vma);
+}
+
+INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
+ avc_start_pgoff, avc_last_pgoff,
+ static inline, __anon_vma_interval_tree)
+
+void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
+ struct rb_root *root)
+{
+#ifdef CONFIG_DEBUG_VM_RB
+ node->cached_vma_start = avc_start_pgoff(node);
+ node->cached_vma_last = avc_last_pgoff(node);
+#endif
+ __anon_vma_interval_tree_insert(node, root);
+}
+
+void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
+ struct rb_root *root)
+{
+ __anon_vma_interval_tree_remove(node, root);
+}
+
+struct anon_vma_chain *
+anon_vma_interval_tree_iter_first(struct rb_root *root,
+ unsigned long first, unsigned long last)
+{
+ return __anon_vma_interval_tree_iter_first(root, first, last);
+}
+
+struct anon_vma_chain *
+anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
+ unsigned long first, unsigned long last)
+{
+ return __anon_vma_interval_tree_iter_next(node, first, last);
+}
+
+#ifdef CONFIG_DEBUG_VM_RB
+void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
+{
+ WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
+ WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
+}
+#endif
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 45eb6217bf38..a217cc544060 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -29,7 +29,7 @@
* - kmemleak_lock (rwlock): protects the object_list modifications and
* accesses to the object_tree_root. The object_list is the main list
* holding the metadata (struct kmemleak_object) for the allocated memory
- * blocks. The object_tree_root is a priority search tree used to look-up
+ * blocks. The object_tree_root is a red black tree used to look-up
* metadata based on a pointer to the corresponding memory block. The
* kmemleak_object structures are added to the object_list and
* object_tree_root in the create_object() function called from the
@@ -71,7 +71,7 @@
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/kthread.h>
-#include <linux/prio_tree.h>
+#include <linux/rbtree.h>
#include <linux/fs.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -132,7 +132,7 @@ struct kmemleak_scan_area {
* Structure holding the metadata for each allocated memory block.
* Modifications to such objects should be made while holding the
* object->lock. Insertions or deletions from object_list, gray_list or
- * tree_node are already protected by the corresponding locks or mutex (see
+ * rb_node are already protected by the corresponding locks or mutex (see
* the notes on locking above). These objects are reference-counted
* (use_count) and freed using the RCU mechanism.
*/
@@ -141,7 +141,7 @@ struct kmemleak_object {
unsigned long flags; /* object status flags */
struct list_head object_list;
struct list_head gray_list;
- struct prio_tree_node tree_node;
+ struct rb_node rb_node;
struct rcu_head rcu; /* object_list lockless traversal */
/* object usage count; object freed when use_count == 0 */
atomic_t use_count;
@@ -182,9 +182,9 @@ struct kmemleak_object {
static LIST_HEAD(object_list);
/* the list of gray-colored objects (see color_gray comment below) */
static LIST_HEAD(gray_list);
-/* prio search tree for object boundaries */
-static struct prio_tree_root object_tree_root;
-/* rw_lock protecting the access to object_list and prio_tree_root */
+/* search tree for object boundaries */
+static struct rb_root object_tree_root = RB_ROOT;
+/* rw_lock protecting the access to object_list and object_tree_root */
static DEFINE_RWLOCK(kmemleak_lock);
/* allocation caches for kmemleak internal data */
@@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
trace.entries = object->trace;
pr_notice("Object 0x%08lx (size %zu):\n",
- object->tree_node.start, object->size);
+ object->pointer, object->size);
pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
object->comm, object->pid, object->jiffies);
pr_notice(" min_count = %d\n", object->min_count);
@@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object)
}
/*
- * Look-up a memory block metadata (kmemleak_object) in the priority search
+ * Look-up a memory block metadata (kmemleak_object) in the object search
* tree based on a pointer value. If alias is 0, only values pointing to the
* beginning of the memory block are allowed. The kmemleak_lock must be held
* when calling this function.
*/
static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
{
- struct prio_tree_node *node;
- struct prio_tree_iter iter;
- struct kmemleak_object *object;
-
- prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr);
- node = prio_tree_next(&iter);
- if (node) {
- object = prio_tree_entry(node, struct kmemleak_object,
- tree_node);
- if (!alias && object->pointer != ptr) {
+ struct rb_node *rb = object_tree_root.rb_node;
+
+ while (rb) {
+ struct kmemleak_object *object =
+ rb_entry(rb, struct kmemleak_object, rb_node);
+ if (ptr < object->pointer)
+ rb = object->rb_node.rb_left;
+ else if (object->pointer + object->size <= ptr)
+ rb = object->rb_node.rb_right;
+ else if (object->pointer == ptr || alias)
+ return object;
+ else {
kmemleak_warn("Found object by alias at 0x%08lx\n",
ptr);
dump_object_info(object);
- object = NULL;
+ break;
}
- } else
- object = NULL;
-
- return object;
+ }
+ return NULL;
}
/*
@@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object)
}
/*
- * Look up an object in the prio search tree and increase its use_count.
+ * Look up an object in the object search tree and increase its use_count.
*/
static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
{
@@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
int min_count, gfp_t gfp)
{
unsigned long flags;
- struct kmemleak_object *object;
- struct prio_tree_node *node;
+ struct kmemleak_object *object, *parent;
+ struct rb_node **link, *rb_parent;
object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
if (!object) {
@@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
/* kernel backtrace */
object->trace_len = __save_stack_trace(object->trace);
- INIT_PRIO_TREE_NODE(&object->tree_node);
- object->tree_node.start = ptr;
- object->tree_node.last = ptr + size - 1;
-
write_lock_irqsave(&kmemleak_lock, flags);
min_addr = min(min_addr, ptr);
max_addr = max(max_addr, ptr + size);
- node = prio_tree_insert(&object_tree_root, &object->tree_node);
- /*
- * The code calling the kernel does not yet have the pointer to the
- * memory block to be able to free it. However, we still hold the
- * kmemleak_lock here in case parts of the kernel started freeing
- * random memory blocks.
- */
- if (node != &object->tree_node) {
- kmemleak_stop("Cannot insert 0x%lx into the object search tree "
- "(already existing)\n", ptr);
- object = lookup_object(ptr, 1);
- spin_lock(&object->lock);
- dump_object_info(object);
- spin_unlock(&object->lock);
-
- goto out;
+ link = &object_tree_root.rb_node;
+ rb_parent = NULL;
+ while (*link) {
+ rb_parent = *link;
+ parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
+ if (ptr + size <= parent->pointer)
+ link = &parent->rb_node.rb_left;
+ else if (parent->pointer + parent->size <= ptr)
+ link = &parent->rb_node.rb_right;
+ else {
+ kmemleak_stop("Cannot insert 0x%lx into the object "
+ "search tree (overlaps existing)\n",
+ ptr);
+ kmem_cache_free(object_cache, object);
+ object = parent;
+ spin_lock(&object->lock);
+ dump_object_info(object);
+ spin_unlock(&object->lock);
+ goto out;
+ }
}
+ rb_link_node(&object->rb_node, rb_parent, link);
+ rb_insert_color(&object->rb_node, &object_tree_root);
+
list_add_tail_rcu(&object->object_list, &object_list);
out:
write_unlock_irqrestore(&kmemleak_lock, flags);
@@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object)
unsigned long flags;
write_lock_irqsave(&kmemleak_lock, flags);
- prio_tree_remove(&object_tree_root, &object->tree_node);
+ rb_erase(&object->rb_node, &object_tree_root);
list_del_rcu(&object->object_list);
write_unlock_irqrestore(&kmemleak_lock, flags);
@@ -1483,13 +1486,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct kmemleak_object *prev_obj = v;
struct kmemleak_object *next_obj = NULL;
- struct list_head *n = &prev_obj->object_list;
+ struct kmemleak_object *obj = prev_obj;
++(*pos);
- list_for_each_continue_rcu(n, &object_list) {
- struct kmemleak_object *obj =
- list_entry(n, struct kmemleak_object, object_list);
+ list_for_each_entry_continue_rcu(obj, &object_list, object_list) {
if (get_object(obj)) {
next_obj = obj;
break;
@@ -1768,7 +1769,6 @@ void __init kmemleak_init(void)
object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
- INIT_PRIO_TREE_ROOT(&object_tree_root);
if (crt_early_log >= ARRAY_SIZE(early_log))
pr_warning("Early log buffer exceeded (%d), please increase "
diff --git a/mm/ksm.c b/mm/ksm.c
index 47c885368890..ae539f0b8aa1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
spinlock_t *ptl;
int swapped;
int err = -EFAULT;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
goto out;
BUG_ON(PageTransCompound(page));
+
+ mmun_start = addr;
+ mmun_end = addr + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
ptep = page_check_address(page, mm, addr, &ptl, 0);
if (!ptep)
- goto out;
+ goto out_mn;
if (pte_write(*ptep) || pte_dirty(*ptep)) {
pte_t entry;
@@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
out_unlock:
pte_unmap_unlock(ptep, ptl);
+out_mn:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return err;
}
@@ -776,6 +785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
spinlock_t *ptl;
unsigned long addr;
int err = -EFAULT;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
@@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
if (!pmd_present(*pmd))
goto out;
+ mmun_start = addr;
+ mmun_end = addr + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!pte_same(*ptep, orig_pte)) {
pte_unmap_unlock(ptep, ptl);
- goto out;
+ goto out_mn;
}
get_page(kpage);
@@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
pte_unmap_unlock(ptep, ptl);
err = 0;
+out_mn:
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return err;
}
@@ -1469,10 +1486,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
*/
if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
VM_PFNMAP | VM_IO | VM_DONTEXPAND |
- VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
- VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
+ VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
return 0; /* just ignore the advice */
+#ifdef VM_SAO
+ if (*vm_flags & VM_SAO)
+ return 0;
+#endif
+
if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
err = __ksm_enter(mm);
if (err)
@@ -1582,7 +1603,7 @@ struct page *ksm_does_need_to_copy(struct page *page,
SetPageSwapBacked(new_page);
__set_page_locked(new_page);
- if (page_evictable(new_page, vma))
+ if (!mlocked_vma_newpage(vma, new_page))
lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
else
add_page_to_unevictable_list(new_page);
@@ -1614,7 +1635,8 @@ again:
struct vm_area_struct *vma;
anon_vma_lock(anon_vma);
- list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+ 0, ULONG_MAX) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
@@ -1667,7 +1689,8 @@ again:
struct vm_area_struct *vma;
anon_vma_lock(anon_vma);
- list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+ 0, ULONG_MAX) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
@@ -1719,7 +1742,8 @@ again:
struct vm_area_struct *vma;
anon_vma_lock(anon_vma);
- list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+ 0, ULONG_MAX) {
vma = vmac->vma;
if (rmap_item->address < vma->vm_start ||
rmap_item->address >= vma->vm_end)
diff --git a/mm/madvise.c b/mm/madvise.c
index 14d260fa0d17..03dfa5c7adb3 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma,
new_flags &= ~VM_DONTCOPY;
break;
case MADV_DONTDUMP:
- new_flags |= VM_NODUMP;
+ new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
- new_flags &= ~VM_NODUMP;
+ if (new_flags & VM_SPECIAL) {
+ error = -EINVAL;
+ goto out;
+ }
+ new_flags &= ~VM_DONTDUMP;
break;
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
diff --git a/mm/memblock.c b/mm/memblock.c
index 82aa349d2f7a..625905523c2a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock = 0;
/* inline so we don't get a warning when pr_debug is compiled out */
-static inline const char *memblock_type_name(struct memblock_type *type)
+static __init_memblock const char *
+memblock_type_name(struct memblock_type *type)
{
if (type == &memblock.memory)
return "memory";
@@ -756,7 +757,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
return ret;
for (i = start_rgn; i < end_rgn; i++)
- type->regions[i].nid = nid;
+ memblock_set_region_node(&type->regions[i], nid);
memblock_merge_regions(type);
return 0;
@@ -929,6 +930,30 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si
return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
}
+void __init_memblock memblock_trim_memory(phys_addr_t align)
+{
+ int i;
+ phys_addr_t start, end, orig_start, orig_end;
+ struct memblock_type *mem = &memblock.memory;
+
+ for (i = 0; i < mem->cnt; i++) {
+ orig_start = mem->regions[i].base;
+ orig_end = mem->regions[i].base + mem->regions[i].size;
+ start = round_up(orig_start, align);
+ end = round_down(orig_end, align);
+
+ if (start == orig_start && end == orig_end)
+ continue;
+
+ if (start < end) {
+ mem->regions[i].base = start;
+ mem->regions[i].size = end - start;
+ } else {
+ memblock_remove_region(mem, i);
+ i--;
+ }
+ }
+}
void __init_memblock memblock_set_current_limit(phys_addr_t limit)
{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 795e525afaba..7acf43bf04a2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -51,6 +51,7 @@
#include <linux/oom.h>
#include "internal.h"
#include <net/sock.h>
+#include <net/ip.h>
#include <net/tcp_memcontrol.h>
#include <asm/uaccess.h>
@@ -326,7 +327,7 @@ struct mem_cgroup {
struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
-#ifdef CONFIG_INET
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct tcp_memcontrol tcp_mem;
#endif
};
@@ -411,12 +412,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
return container_of(s, struct mem_cgroup, css);
}
+static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
+{
+ return (memcg == root_mem_cgroup);
+}
+
/* Writing them here to avoid exposing memcg's inner layout */
-#ifdef CONFIG_MEMCG_KMEM
-#include <net/sock.h>
-#include <net/ip.h>
+#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
-static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
void sock_update_memcg(struct sock *sk)
{
if (mem_cgroup_sockets_enabled) {
@@ -461,7 +464,6 @@ void sock_release_memcg(struct sock *sk)
}
}
-#ifdef CONFIG_INET
struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
{
if (!memcg || mem_cgroup_is_root(memcg))
@@ -470,10 +472,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
return &memcg->tcp_mem.cg_proto;
}
EXPORT_SYMBOL(tcp_proto_cgroup);
-#endif /* CONFIG_INET */
-#endif /* CONFIG_MEMCG_KMEM */
-#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
@@ -1016,11 +1015,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
-{
- return (memcg == root_mem_cgroup);
-}
-
void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
struct mem_cgroup *memcg;
@@ -4973,6 +4967,13 @@ mem_cgroup_create(struct cgroup *cont)
} else {
res_counter_init(&memcg->res, NULL);
res_counter_init(&memcg->memsw, NULL);
+ /*
+ * Deeper hierachy with use_hierarchy == false doesn't make
+ * much sense so let cgroup subsystem know about this
+ * unfortunate state in our controller.
+ */
+ if (parent && parent != root_mem_cgroup)
+ mem_cgroup_subsys.broken_hierarchy = true;
}
memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a6e2141a6610..6c5899b9034a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
struct vm_area_struct *vma;
struct task_struct *tsk;
struct anon_vma *av;
+ pgoff_t pgoff;
av = page_lock_anon_vma(page);
if (av == NULL) /* Not actually mapped anymore */
return;
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
read_lock(&tasklist_lock);
for_each_process (tsk) {
struct anon_vma_chain *vmac;
if (!task_early_kill(tsk))
continue;
- list_for_each_entry(vmac, &av->head, same_anon_vma) {
+ anon_vma_interval_tree_foreach(vmac, &av->rb_root,
+ pgoff, pgoff) {
vma = vmac->vma;
if (!page_mapped_in_vma(page, vma))
continue;
@@ -431,7 +434,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
{
struct vm_area_struct *vma;
struct task_struct *tsk;
- struct prio_tree_iter iter;
struct address_space *mapping = page->mapping;
mutex_lock(&mapping->i_mmap_mutex);
@@ -442,7 +444,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
if (!task_early_kill(tsk))
continue;
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
pgoff) {
/*
* Send early kill signal to tasks where a vma covers
diff --git a/mm/memory.c b/mm/memory.c
index 57361708d1a5..fb135ba4aba9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
add_taint(TAINT_BAD_PAGE);
}
-static inline int is_cow_mapping(vm_flags_t flags)
+static inline bool is_cow_mapping(vm_flags_t flags)
{
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
@@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
unsigned long next;
unsigned long addr = vma->vm_start;
unsigned long end = vma->vm_end;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+ bool is_cow;
int ret;
/*
@@ -1047,7 +1050,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* readonly mappings. The tradeoff is that copy_page_range is more
* efficient than faulting.
*/
- if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
+ if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
+ VM_PFNMAP | VM_MIXEDMAP))) {
if (!vma->anon_vma)
return 0;
}
@@ -1055,12 +1059,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst_mm, src_mm, vma);
- if (unlikely(is_pfn_mapping(vma))) {
+ if (unlikely(vma->vm_flags & VM_PFNMAP)) {
/*
* We do not free on error cases below as remove_vma
* gets called on error from higher level routine
*/
- ret = track_pfn_vma_copy(vma);
+ ret = track_pfn_copy(vma);
if (ret)
return ret;
}
@@ -1071,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* parent mm. And a permission downgrade will only happen if
* is_cow_mapping() returns true.
*/
- if (is_cow_mapping(vma->vm_flags))
- mmu_notifier_invalidate_range_start(src_mm, addr, end);
+ is_cow = is_cow_mapping(vma->vm_flags);
+ mmun_start = addr;
+ mmun_end = end;
+ if (is_cow)
+ mmu_notifier_invalidate_range_start(src_mm, mmun_start,
+ mmun_end);
ret = 0;
dst_pgd = pgd_offset(dst_mm, addr);
@@ -1088,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
- if (is_cow_mapping(vma->vm_flags))
- mmu_notifier_invalidate_range_end(src_mm,
- vma->vm_start, end);
+ if (is_cow)
+ mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
return ret;
}
@@ -1327,8 +1334,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
if (vma->vm_file)
uprobe_munmap(vma, start, end);
- if (unlikely(is_pfn_mapping(vma)))
- untrack_pfn_vma(vma, 0, 0);
+ if (unlikely(vma->vm_flags & VM_PFNMAP))
+ untrack_pfn(vma, 0, 0);
if (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
@@ -1521,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
spin_unlock(&mm->page_table_lock);
wait_split_huge_page(vma->anon_vma, pmd);
} else {
- page = follow_trans_huge_pmd(mm, address,
+ page = follow_trans_huge_pmd(vma, address,
pmd, flags);
spin_unlock(&mm->page_table_lock);
goto out;
@@ -1576,12 +1583,12 @@ split_fallthrough:
if (page->mapping && trylock_page(page)) {
lru_add_drain(); /* push cached pages to LRU */
/*
- * Because we lock page here and migration is
- * blocked by the pte's page reference, we need
- * only check for file-cache page truncation.
+ * Because we lock page here, and migration is
+ * blocked by the pte's page reference, and we
+ * know the page is still mapped, we don't even
+ * need to check for file-cache page truncation.
*/
- if (page->mapping)
- mlock_vma_page(page);
+ mlock_vma_page(page);
unlock_page(page);
}
}
@@ -2085,6 +2092,11 @@ out:
* ask for a shared writable mapping!
*
* The page does not need to be reserved.
+ *
+ * Usually this function is called from f_op->mmap() handler
+ * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
+ * Caller must set VM_MIXEDMAP on vma if it wants to call this
+ * function from other places, for example from page-fault handler.
*/
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page)
@@ -2093,7 +2105,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
return -EFAULT;
if (!page_count(page))
return -EINVAL;
- vma->vm_flags |= VM_INSERTPAGE;
+ if (!(vma->vm_flags & VM_MIXEDMAP)) {
+ BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
+ BUG_ON(vma->vm_flags & VM_PFNMAP);
+ vma->vm_flags |= VM_MIXEDMAP;
+ }
return insert_page(vma, addr, page, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_page);
@@ -2132,7 +2148,7 @@ out:
* @addr: target user address of this page
* @pfn: source kernel pfn
*
- * Similar to vm_inert_page, this allows drivers to insert individual pages
+ * Similar to vm_insert_page, this allows drivers to insert individual pages
* they've allocated into a user vma. Same comments apply.
*
* This function should only be called from a vm_ops->fault handler, and
@@ -2162,14 +2178,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
- if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
+ if (track_pfn_insert(vma, &pgprot, pfn))
return -EINVAL;
ret = insert_pfn(vma, addr, pfn, pgprot);
- if (ret)
- untrack_pfn_vma(vma, pfn, PAGE_SIZE);
-
return ret;
}
EXPORT_SYMBOL(vm_insert_pfn);
@@ -2290,37 +2303,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
* rest of the world about it:
* VM_IO tells people not to look at these pages
* (accesses can have side effects).
- * VM_RESERVED is specified all over the place, because
- * in 2.4 it kept swapout's vma scan off this vma; but
- * in 2.6 the LRU scan won't even find its pages, so this
- * flag means no more than count its pages in reserved_vm,
- * and omit it from core dump, even when VM_IO turned off.
* VM_PFNMAP tells the core MM that the base pages are just
* raw PFN mappings, and do not have a "struct page" associated
* with them.
+ * VM_DONTEXPAND
+ * Disable vma merging and expanding with mremap().
+ * VM_DONTDUMP
+ * Omit vma from core dump, even when VM_IO turned off.
*
* There's a horrible special case to handle copy-on-write
* behaviour that some programs depend on. We mark the "original"
* un-COW'ed pages by matching them up with "vma->vm_pgoff".
+ * See vm_normal_page() for details.
*/
- if (addr == vma->vm_start && end == vma->vm_end) {
+ if (is_cow_mapping(vma->vm_flags)) {
+ if (addr != vma->vm_start || end != vma->vm_end)
+ return -EINVAL;
vma->vm_pgoff = pfn;
- vma->vm_flags |= VM_PFN_AT_MMAP;
- } else if (is_cow_mapping(vma->vm_flags))
- return -EINVAL;
-
- vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+ }
- err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
- if (err) {
- /*
- * To indicate that track_pfn related cleanup is not
- * needed from higher level routine calling unmap_vmas
- */
- vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
- vma->vm_flags &= ~VM_PFN_AT_MMAP;
+ err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
+ if (err)
return -EINVAL;
- }
+
+ vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
@@ -2335,7 +2341,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
} while (pgd++, addr = next, addr != end);
if (err)
- untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
+ untrack_pfn(vma, pfn, PAGE_ALIGN(size));
return err;
}
@@ -2516,11 +2522,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
spinlock_t *ptl, pte_t orig_pte)
__releases(ptl)
{
- struct page *old_page, *new_page;
+ struct page *old_page, *new_page = NULL;
pte_t entry;
int ret = 0;
int page_mkwrite = 0;
struct page *dirty_page = NULL;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
+ bool mmun_called = false; /* For mmu_notifiers */
old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page) {
@@ -2698,6 +2707,11 @@ gotten:
if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
+ mmun_start = address & PAGE_MASK;
+ mmun_end = (address & PAGE_MASK) + PAGE_SIZE;
+ mmun_called = true;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
/*
* Re-check the pte - we dropped the lock
*/
@@ -2764,6 +2778,8 @@ gotten:
page_cache_release(new_page);
unlock:
pte_unmap_unlock(page_table, ptl);
+ if (mmun_called)
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
@@ -2801,14 +2817,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
}
-static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
+static inline void unmap_mapping_range_tree(struct rb_root *root,
struct zap_details *details)
{
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
pgoff_t vba, vea, zba, zea;
- vma_prio_tree_foreach(vma, &iter, root,
+ vma_interval_tree_foreach(vma, root,
details->first_index, details->last_index) {
vba = vma->vm_pgoff;
@@ -2839,7 +2854,7 @@ static inline void unmap_mapping_range_list(struct list_head *head,
* across *all* the pages in each nonlinear VMA, not just the pages
* whose virtual address lies outside the file truncation point.
*/
- list_for_each_entry(vma, head, shared.vm_set.list) {
+ list_for_each_entry(vma, head, shared.nonlinear) {
details->nonlinear_vma = vma;
unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
}
@@ -2883,7 +2898,7 @@ void unmap_mapping_range(struct address_space *mapping,
mutex_lock(&mapping->i_mmap_mutex);
- if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6a5b90d0cfd7..56b758ae57d2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
void __ref put_page_bootmem(struct page *page)
{
unsigned long type;
+ struct zone *zone;
type = (unsigned long) page->lru.next;
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page)
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
__free_pages_bootmem(page, 0);
+
+ zone = page_zone(page);
+ zone_span_writelock(zone);
+ zone->present_pages++;
+ zone_span_writeunlock(zone);
+ totalram_pages++;
}
}
@@ -362,11 +369,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
BUG_ON(nr_pages % PAGES_PER_SECTION);
+ release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
+
sections_to_remove = nr_pages / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
- release_mem_region(pfn << PAGE_SHIFT,
- PAGES_PER_SECTION << PAGE_SHIFT);
ret = __remove_section(zone, __pfn_to_section(pfn));
if (ret)
break;
@@ -756,13 +763,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
return 0;
}
-static struct page *
-hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
-{
- /* This should be improooooved!! */
- return alloc_page(GFP_HIGHUSER_MOVABLE);
-}
-
#define NR_OFFLINE_AT_ONCE_PAGES (256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
@@ -813,8 +813,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
putback_lru_pages(&source);
goto out;
}
- /* this function returns # of failed pages */
- ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
+
+ /*
+ * alloc_migrate_target should be improooooved!!
+ * migrate_pages returns # of failed pages.
+ */
+ ret = migrate_pages(&source, alloc_migrate_target, 0,
true, MIGRATE_SYNC);
if (ret)
putback_lru_pages(&source);
@@ -870,7 +874,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
return offlined;
}
-static int __ref offline_pages(unsigned long start_pfn,
+static int __ref __offline_pages(unsigned long start_pfn,
unsigned long end_pfn, unsigned long timeout)
{
unsigned long pfn, nr_pages, expire;
@@ -970,8 +974,13 @@ repeat:
init_per_zone_wmark_min();
- if (!populated_zone(zone))
+ if (!populated_zone(zone)) {
zone_pcp_reset(zone);
+ mutex_lock(&zonelists_mutex);
+ build_all_zonelists(NULL, NULL);
+ mutex_unlock(&zonelists_mutex);
+ } else
+ zone_pcp_update(zone);
if (!node_present_pages(node)) {
node_clear_state(node, N_HIGH_MEMORY);
@@ -998,15 +1007,55 @@ out:
return ret;
}
+int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
+{
+ return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
+}
+
int remove_memory(u64 start, u64 size)
{
+ struct memory_block *mem = NULL;
+ struct mem_section *section;
unsigned long start_pfn, end_pfn;
+ unsigned long pfn, section_nr;
+ int ret;
start_pfn = PFN_DOWN(start);
end_pfn = start_pfn + PFN_DOWN(size);
- return offline_pages(start_pfn, end_pfn, 120 * HZ);
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ section_nr = pfn_to_section_nr(pfn);
+ if (!present_section_nr(section_nr))
+ continue;
+
+ section = __nr_to_section(section_nr);
+ /* same memblock? */
+ if (mem)
+ if ((section_nr >= mem->start_section_nr) &&
+ (section_nr <= mem->end_section_nr))
+ continue;
+
+ mem = find_memory_block_hinted(section, mem);
+ if (!mem)
+ continue;
+
+ ret = offline_memory_block(mem);
+ if (ret) {
+ kobject_put(&mem->dev.kobj);
+ return ret;
+ }
+ }
+
+ if (mem)
+ kobject_put(&mem->dev.kobj);
+
+ return 0;
}
#else
+int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
+{
+ return -EINVAL;
+}
int remove_memory(u64 start, u64 size)
{
return -EINVAL;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ada3be6e252..d04a8a54c294 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -607,6 +607,42 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
return first;
}
+/*
+ * Apply policy to a single VMA
+ * This must be called with the mmap_sem held for writing.
+ */
+static int vma_replace_policy(struct vm_area_struct *vma,
+ struct mempolicy *pol)
+{
+ int err;
+ struct mempolicy *old;
+ struct mempolicy *new;
+
+ pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+ vma->vm_start, vma->vm_end, vma->vm_pgoff,
+ vma->vm_ops, vma->vm_file,
+ vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+
+ new = mpol_dup(pol);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ if (vma->vm_ops && vma->vm_ops->set_policy) {
+ err = vma->vm_ops->set_policy(vma, new);
+ if (err)
+ goto err_out;
+ }
+
+ old = vma->vm_policy;
+ vma->vm_policy = new; /* protected by mmap_sem */
+ mpol_put(old);
+
+ return 0;
+ err_out:
+ mpol_put(new);
+ return err;
+}
+
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
@@ -655,23 +691,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
if (err)
goto out;
}
-
- /*
- * Apply policy to a single VMA. The reference counting of
- * policy for vma_policy linkages has already been handled by
- * vma_merge and split_vma as necessary. If this is a shared
- * policy then ->set_policy will increment the reference count
- * for an sp node.
- */
- pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
- vma->vm_start, vma->vm_end, vma->vm_pgoff,
- vma->vm_ops, vma->vm_file,
- vma->vm_ops ? vma->vm_ops->set_policy : NULL);
- if (vma->vm_ops && vma->vm_ops->set_policy) {
- err = vma->vm_ops->set_policy(vma, new_pol);
- if (err)
- goto out;
- }
+ err = vma_replace_policy(vma, new_pol);
+ if (err)
+ goto out;
}
out:
@@ -924,15 +946,18 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
nodemask_t nmask;
LIST_HEAD(pagelist);
int err = 0;
- struct vm_area_struct *vma;
nodes_clear(nmask);
node_set(source, nmask);
- vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
+ /*
+ * This does not "check" the range but isolates all pages that
+ * need migration. Between passing in the full user address
+ * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
+ */
+ VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
+ check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
- if (IS_ERR(vma))
- return PTR_ERR(vma);
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_node_page, dest,
@@ -1511,9 +1536,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
*
* Returns effective policy for a VMA at specified address.
* Falls back to @task or system default policy, as necessary.
- * Current or other task's task mempolicy and non-shared vma policies
- * are protected by the task's mmap_sem, which must be held for read by
- * the caller.
+ * Current or other task's task mempolicy and non-shared vma policies must be
+ * protected by task_lock(task) by the caller.
* Shared policies [those marked as MPOL_F_SHARED] require an extra reference
* count--added by the get_policy() vm_op, as appropriate--to protect against
* freeing by another task. It is the caller's responsibility to free the
@@ -1530,8 +1554,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
addr);
if (vpol)
pol = vpol;
- } else if (vma->vm_policy)
+ } else if (vma->vm_policy) {
pol = vma->vm_policy;
+
+ /*
+ * shmem_alloc_page() passes MPOL_F_SHARED policy with
+ * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+ * count on these policies which will be dropped by
+ * mpol_cond_put() later
+ */
+ if (mpol_needs_cond_ref(pol))
+ mpol_get(pol);
+ }
}
if (!pol)
pol = &default_policy;
@@ -2061,7 +2095,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
*/
/* lookup first element intersecting start-end */
-/* Caller holds sp->lock */
+/* Caller holds sp->mutex */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
@@ -2125,36 +2159,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
if (!sp->root.rb_node)
return NULL;
- spin_lock(&sp->lock);
+ mutex_lock(&sp->mutex);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
- spin_unlock(&sp->lock);
+ mutex_unlock(&sp->mutex);
return pol;
}
+static void sp_free(struct sp_node *n)
+{
+ mpol_put(n->policy);
+ kmem_cache_free(sn_cache, n);
+}
+
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-l%lx\n", n->start, n->end);
rb_erase(&n->nd, &sp->root);
- mpol_put(n->policy);
- kmem_cache_free(sn_cache, n);
+ sp_free(n);
}
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
struct mempolicy *pol)
{
- struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+ struct sp_node *n;
+ struct mempolicy *newpol;
+ n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n)
return NULL;
+
+ newpol = mpol_dup(pol);
+ if (IS_ERR(newpol)) {
+ kmem_cache_free(sn_cache, n);
+ return NULL;
+ }
+ newpol->flags |= MPOL_F_SHARED;
+
n->start = start;
n->end = end;
- mpol_get(pol);
- pol->flags |= MPOL_F_SHARED; /* for unref */
- n->policy = pol;
+ n->policy = newpol;
+
return n;
}
@@ -2162,10 +2210,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
unsigned long end, struct sp_node *new)
{
- struct sp_node *n, *new2 = NULL;
+ struct sp_node *n;
+ int ret = 0;
-restart:
- spin_lock(&sp->lock);
+ mutex_lock(&sp->mutex);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
@@ -2178,16 +2226,14 @@ restart:
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
+ struct sp_node *new2;
+ new2 = sp_alloc(end, n->end, n->policy);
if (!new2) {
- spin_unlock(&sp->lock);
- new2 = sp_alloc(end, n->end, n->policy);
- if (!new2)
- return -ENOMEM;
- goto restart;
+ ret = -ENOMEM;
+ goto out;
}
n->end = start;
sp_insert(sp, new2);
- new2 = NULL;
break;
} else
n->end = start;
@@ -2198,12 +2244,9 @@ restart:
}
if (new)
sp_insert(sp, new);
- spin_unlock(&sp->lock);
- if (new2) {
- mpol_put(new2->policy);
- kmem_cache_free(sn_cache, new2);
- }
- return 0;
+out:
+ mutex_unlock(&sp->mutex);
+ return ret;
}
/**
@@ -2221,7 +2264,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
- spin_lock_init(&sp->lock);
+ mutex_init(&sp->mutex);
if (mpol) {
struct vm_area_struct pvma;
@@ -2275,7 +2318,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
}
err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
if (err && new)
- kmem_cache_free(sn_cache, new);
+ sp_free(new);
return err;
}
@@ -2287,16 +2330,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
if (!p->root.rb_node)
return;
- spin_lock(&p->lock);
+ mutex_lock(&p->mutex);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
- rb_erase(&n->nd, &p->root);
- mpol_put(n->policy);
- kmem_cache_free(sn_cache, n);
+ sp_delete(p, n);
}
- spin_unlock(&p->lock);
+ mutex_unlock(&p->mutex);
}
/* assumes fs == KERNEL_DS */
diff --git a/mm/mlock.c b/mm/mlock.c
index ef726e8aa8e9..f0b9ce572fc7 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -51,15 +51,13 @@ EXPORT_SYMBOL(can_do_mlock);
/*
* LRU accounting for clear_page_mlock()
*/
-void __clear_page_mlock(struct page *page)
+void clear_page_mlock(struct page *page)
{
- VM_BUG_ON(!PageLocked(page));
-
- if (!page->mapping) { /* truncated ? */
+ if (!TestClearPageMlocked(page))
return;
- }
- dec_zone_page_state(page, NR_MLOCK);
+ mod_zone_page_state(page_zone(page), NR_MLOCK,
+ -hpage_nr_pages(page));
count_vm_event(UNEVICTABLE_PGCLEARED);
if (!isolate_lru_page(page)) {
putback_lru_page(page);
@@ -81,7 +79,8 @@ void mlock_vma_page(struct page *page)
BUG_ON(!PageLocked(page));
if (!TestSetPageMlocked(page)) {
- inc_zone_page_state(page, NR_MLOCK);
+ mod_zone_page_state(page_zone(page), NR_MLOCK,
+ hpage_nr_pages(page));
count_vm_event(UNEVICTABLE_PGMLOCKED);
if (!isolate_lru_page(page))
putback_lru_page(page);
@@ -108,7 +107,8 @@ void munlock_vma_page(struct page *page)
BUG_ON(!PageLocked(page));
if (TestClearPageMlocked(page)) {
- dec_zone_page_state(page, NR_MLOCK);
+ mod_zone_page_state(page_zone(page), NR_MLOCK,
+ -hpage_nr_pages(page));
if (!isolate_lru_page(page)) {
int ret = SWAP_AGAIN;
@@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
if (vma->vm_flags & (VM_IO | VM_PFNMAP))
goto no_mlock;
- if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
+ if (!((vma->vm_flags & VM_DONTEXPAND) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))) {
@@ -290,14 +290,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
if (page && !IS_ERR(page)) {
lock_page(page);
- /*
- * Like in __mlock_vma_pages_range(),
- * because we lock page here and migration is
- * blocked by the elevated reference, we need
- * only check for file-cache page truncation.
- */
- if (page->mapping)
- munlock_vma_page(page);
+ munlock_vma_page(page);
unlock_page(page);
put_page(page);
}
diff --git a/mm/mmap.c b/mm/mmap.c
index ae18a48e7e4e..2d942353d681 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
-/*
- * WARNING: the debugging will use recursive algorithms so never enable this
- * unless you know what you are doing.
- */
-#undef DEBUG_MM_RB
-
/* description of effects of mapping type and prot in current implementation.
* this is due to the limited x86 page protection hardware. The expected
* behavior is in parens:
@@ -199,14 +193,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
flush_dcache_mmap_lock(mapping);
if (unlikely(vma->vm_flags & VM_NONLINEAR))
- list_del_init(&vma->shared.vm_set.list);
+ list_del_init(&vma->shared.nonlinear);
else
- vma_prio_tree_remove(vma, &mapping->i_mmap);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
/*
- * Unlink a file-based vm structure from its prio_tree, to hide
+ * Unlink a file-based vm structure from its interval tree, to hide
* vma from rmap and vmtruncate before freeing its page tables.
*/
void unlink_file_vma(struct vm_area_struct *vma)
@@ -231,11 +225,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
might_sleep();
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (vma->vm_file) {
+ if (vma->vm_file)
fput(vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(vma->vm_mm);
- }
mpol_put(vma_policy(vma));
kmem_cache_free(vm_area_cachep, vma);
return next;
@@ -306,7 +297,7 @@ out:
return retval;
}
-#ifdef DEBUG_MM_RB
+#ifdef CONFIG_DEBUG_VM_RB
static int browse_rb(struct rb_root *root)
{
int i = 0, j;
@@ -340,9 +331,12 @@ void validate_mm(struct mm_struct *mm)
{
int bug = 0;
int i = 0;
- struct vm_area_struct *tmp = mm->mmap;
- while (tmp) {
- tmp = tmp->vm_next;
+ struct vm_area_struct *vma = mm->mmap;
+ while (vma) {
+ struct anon_vma_chain *avc;
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_verify(avc);
+ vma = vma->vm_next;
i++;
}
if (i != mm->map_count)
@@ -356,17 +350,46 @@ void validate_mm(struct mm_struct *mm)
#define validate_mm(mm) do { } while (0)
#endif
-static struct vm_area_struct *
-find_vma_prepare(struct mm_struct *mm, unsigned long addr,
- struct vm_area_struct **pprev, struct rb_node ***rb_link,
- struct rb_node ** rb_parent)
+/*
+ * vma has some anon_vma assigned, and is already inserted on that
+ * anon_vma's interval trees.
+ *
+ * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
+ * vma must be removed from the anon_vma's interval trees using
+ * anon_vma_interval_tree_pre_update_vma().
+ *
+ * After the update, the vma will be reinserted using
+ * anon_vma_interval_tree_post_update_vma().
+ *
+ * The entire update must be protected by exclusive mmap_sem and by
+ * the root anon_vma's mutex.
+ */
+static inline void
+anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
{
- struct vm_area_struct * vma;
- struct rb_node ** __rb_link, * __rb_parent, * rb_prev;
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
+}
+
+static inline void
+anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
+}
+
+static int find_vma_links(struct mm_struct *mm, unsigned long addr,
+ unsigned long end, struct vm_area_struct **pprev,
+ struct rb_node ***rb_link, struct rb_node **rb_parent)
+{
+ struct rb_node **__rb_link, *__rb_parent, *rb_prev;
__rb_link = &mm->mm_rb.rb_node;
rb_prev = __rb_parent = NULL;
- vma = NULL;
while (*__rb_link) {
struct vm_area_struct *vma_tmp;
@@ -375,9 +398,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- break;
+ /* Fail if an existing vma overlaps the area */
+ if (vma_tmp->vm_start < end)
+ return -ENOMEM;
__rb_link = &__rb_parent->rb_left;
} else {
rb_prev = __rb_parent;
@@ -390,7 +413,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
*rb_link = __rb_link;
*rb_parent = __rb_parent;
- return vma;
+ return 0;
}
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -417,7 +440,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
if (unlikely(vma->vm_flags & VM_NONLINEAR))
vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
else
- vma_prio_tree_insert(vma, &mapping->i_mmap);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
}
@@ -455,15 +478,16 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Helper for vma_adjust() in the split_vma insert case: insert a vma into the
- * mm's list and rbtree. It has already been inserted into the prio_tree.
+ * mm's list and rbtree. It has already been inserted into the interval tree.
*/
static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct *__vma, *prev;
+ struct vm_area_struct *prev;
struct rb_node **rb_link, *rb_parent;
- __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
- BUG_ON(__vma && __vma->vm_start < vma->vm_end);
+ if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+ &prev, &rb_link, &rb_parent))
+ BUG();
__vma_link(mm, vma, prev, rb_link, rb_parent);
mm->map_count++;
}
@@ -496,7 +520,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
struct vm_area_struct *next = vma->vm_next;
struct vm_area_struct *importer = NULL;
struct address_space *mapping = NULL;
- struct prio_tree_root *root = NULL;
+ struct rb_root *root = NULL;
struct anon_vma *anon_vma = NULL;
struct file *file = vma->vm_file;
long adjust_next = 0;
@@ -559,7 +583,7 @@ again: remove_next = 1 + (end > next->vm_end);
mutex_lock(&mapping->i_mmap_mutex);
if (insert) {
/*
- * Put into prio_tree now, so instantiated pages
+ * Put into interval tree now, so instantiated pages
* are visible to arm/parisc __flush_dcache_page
* throughout; but we cannot insert into address
* space until vma start or end is updated.
@@ -570,22 +594,23 @@ again: remove_next = 1 + (end > next->vm_end);
vma_adjust_trans_huge(vma, start, end, adjust_next);
- /*
- * When changing only vma->vm_end, we don't really need anon_vma
- * lock. This is a fairly rare case by itself, but the anon_vma
- * lock may be shared between many sibling processes. Skipping
- * the lock for brk adjustments makes a difference sometimes.
- */
- if (vma->anon_vma && (importer || start != vma->vm_start)) {
- anon_vma = vma->anon_vma;
+ anon_vma = vma->anon_vma;
+ if (!anon_vma && adjust_next)
+ anon_vma = next->anon_vma;
+ if (anon_vma) {
+ VM_BUG_ON(adjust_next && next->anon_vma &&
+ anon_vma != next->anon_vma);
anon_vma_lock(anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ if (adjust_next)
+ anon_vma_interval_tree_pre_update_vma(next);
}
if (root) {
flush_dcache_mmap_lock(mapping);
- vma_prio_tree_remove(vma, root);
+ vma_interval_tree_remove(vma, root);
if (adjust_next)
- vma_prio_tree_remove(next, root);
+ vma_interval_tree_remove(next, root);
}
vma->vm_start = start;
@@ -598,8 +623,8 @@ again: remove_next = 1 + (end > next->vm_end);
if (root) {
if (adjust_next)
- vma_prio_tree_insert(next, root);
- vma_prio_tree_insert(vma, root);
+ vma_interval_tree_insert(next, root);
+ vma_interval_tree_insert(vma, root);
flush_dcache_mmap_unlock(mapping);
}
@@ -620,8 +645,12 @@ again: remove_next = 1 + (end > next->vm_end);
__insert_vm_struct(mm, insert);
}
- if (anon_vma)
+ if (anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ if (adjust_next)
+ anon_vma_interval_tree_post_update_vma(next);
anon_vma_unlock(anon_vma);
+ }
if (mapping)
mutex_unlock(&mapping->i_mmap_mutex);
@@ -636,8 +665,6 @@ again: remove_next = 1 + (end > next->vm_end);
if (file) {
uprobe_munmap(next, next->vm_start, next->vm_end);
fput(file);
- if (next->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(mm);
}
if (next->anon_vma)
anon_vma_merge(vma, next);
@@ -669,8 +696,7 @@ again: remove_next = 1 + (end > next->vm_end);
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags)
{
- /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */
- if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
+ if (vma->vm_flags ^ vm_flags)
return 0;
if (vma->vm_file != file)
return 0;
@@ -951,8 +977,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
mm->exec_vm += pages;
} else if (flags & stack_flags)
mm->stack_vm += pages;
- if (flags & (VM_RESERVED|VM_IO))
- mm->reserved_vm += pages;
}
#endif /* CONFIG_PROC_FS */
@@ -1190,7 +1214,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
return 0;
/* Specialty mapping? */
- if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE))
+ if (vm_flags & VM_PFNMAP)
return 0;
/* Can the mapping track the dirty pages? */
@@ -1229,8 +1253,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/* Clear old maps */
error = -ENOMEM;
munmap_back:
- vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
- if (vma && vma->vm_start < addr + len) {
+ if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
goto munmap_back;
@@ -1301,13 +1324,10 @@ munmap_back:
goto free_vma;
correct_wcount = 1;
}
- vma->vm_file = file;
- get_file(file);
+ vma->vm_file = get_file(file);
error = file->f_op->mmap(file, vma);
if (error)
goto unmap_and_free_vma;
- if (vm_flags & VM_EXECUTABLE)
- added_exe_file_vma(mm);
/* Can addr have changed??
*
@@ -1758,13 +1778,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
+ anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
+ anon_vma_interval_tree_post_update_vma(vma);
perf_event_mmap(vma);
}
}
}
vma_unlock_anon_vma(vma);
khugepaged_enter_vma_merge(vma);
+ validate_mm(vma->vm_mm);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1808,14 +1831,17 @@ int expand_downwards(struct vm_area_struct *vma,
if (grow <= vma->vm_pgoff) {
error = acct_stack_growth(vma, size, grow);
if (!error) {
+ anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
+ anon_vma_interval_tree_post_update_vma(vma);
perf_event_mmap(vma);
}
}
}
vma_unlock_anon_vma(vma);
khugepaged_enter_vma_merge(vma);
+ validate_mm(vma->vm_mm);
return error;
}
@@ -1989,11 +2015,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
if (anon_vma_clone(new, vma))
goto out_free_mpol;
- if (new->vm_file) {
+ if (new->vm_file)
get_file(new->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- added_exe_file_vma(mm);
- }
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);
@@ -2011,11 +2034,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
/* Clean everything up if vma_adjust failed. */
if (new->vm_ops && new->vm_ops->close)
new->vm_ops->close(new);
- if (new->vm_file) {
- if (vma->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(mm);
+ if (new->vm_file)
fput(new->vm_file);
- }
unlink_anon_vmas(new);
out_free_mpol:
mpol_put(pol);
@@ -2200,8 +2220,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
* Clear old maps. this also does some error checking for us
*/
munmap_back:
- vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
- if (vma && vma->vm_start < addr + len) {
+ if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
goto munmap_back;
@@ -2315,10 +2334,10 @@ void exit_mmap(struct mm_struct *mm)
* and into the inode's i_mmap tree. If vm_file is non-NULL
* then i_mmap_mutex is taken here.
*/
-int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct * __vma, * prev;
- struct rb_node ** rb_link, * rb_parent;
+ struct vm_area_struct *prev;
+ struct rb_node **rb_link, *rb_parent;
/*
* The vm_pgoff of a purely anonymous vma should be irrelevant
@@ -2336,8 +2355,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
BUG_ON(vma->anon_vma);
vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
}
- __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent);
- if (__vma && __vma->vm_start < vma->vm_end)
+ if (find_vma_links(mm, vma->vm_start, vma->vm_end,
+ &prev, &rb_link, &rb_parent))
return -ENOMEM;
if ((vma->vm_flags & VM_ACCOUNT) &&
security_vm_enough_memory_mm(mm, vma_pages(vma)))
@@ -2352,7 +2371,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
* prior to moving page table entries, to effect an mremap move.
*/
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
- unsigned long addr, unsigned long len, pgoff_t pgoff)
+ unsigned long addr, unsigned long len, pgoff_t pgoff,
+ bool *need_rmap_locks)
{
struct vm_area_struct *vma = *vmap;
unsigned long vma_start = vma->vm_start;
@@ -2371,7 +2391,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
faulted_in_anon_vma = false;
}
- find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+ if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
+ return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
if (new_vma) {
@@ -2393,32 +2414,29 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
* linear if there are no pages mapped yet.
*/
VM_BUG_ON(faulted_in_anon_vma);
- *vmap = new_vma;
- } else
- anon_vma_moveto_tail(new_vma);
+ *vmap = vma = new_vma;
+ }
+ *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
} else {
new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (new_vma) {
*new_vma = *vma;
+ new_vma->vm_start = addr;
+ new_vma->vm_end = addr + len;
+ new_vma->vm_pgoff = pgoff;
pol = mpol_dup(vma_policy(vma));
if (IS_ERR(pol))
goto out_free_vma;
+ vma_set_policy(new_vma, pol);
INIT_LIST_HEAD(&new_vma->anon_vma_chain);
if (anon_vma_clone(new_vma, vma))
goto out_free_mempol;
- vma_set_policy(new_vma, pol);
- new_vma->vm_start = addr;
- new_vma->vm_end = addr + len;
- new_vma->vm_pgoff = pgoff;
- if (new_vma->vm_file) {
+ if (new_vma->vm_file)
get_file(new_vma->vm_file);
-
- if (vma->vm_flags & VM_EXECUTABLE)
- added_exe_file_vma(mm);
- }
if (new_vma->vm_ops && new_vma->vm_ops->open)
new_vma->vm_ops->open(new_vma);
vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ *need_rmap_locks = false;
}
}
return new_vma;
@@ -2536,7 +2554,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
- if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
+ if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
/*
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
@@ -2552,7 +2570,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
* anon_vma->root->mutex.
*/
if (__test_and_set_bit(0, (unsigned long *)
- &anon_vma->root->head.next))
+ &anon_vma->root->rb_root.rb_node))
BUG();
}
}
@@ -2593,7 +2611,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* A single task can't take more than one mm_take_all_locks() in a row
* or it would deadlock.
*
- * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
+ * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
* mapping->flags avoid to take the same lock twice, if more than one
* vma in this mm is backed by the same anon_vma or address_space.
*
@@ -2640,13 +2658,13 @@ out_unlock:
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
- if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
+ if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
/*
* The LSB of head.next can't change to 0 from under
* us because we hold the mm_all_locks_mutex.
*
* We must however clear the bitflag before unlocking
- * the vma so the users using the anon_vma->head will
+ * the vma so the users using the anon_vma->rb_root will
* never see our bitflag.
*
* No need of atomic instructions here, head.next
@@ -2654,7 +2672,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
* anon_vma->root->mutex.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
- &anon_vma->root->head.next))
+ &anon_vma->root->rb_root.rb_node))
BUG();
anon_vma_unlock(anon_vma);
}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 862b60822d9f..8a5ac8c686b0 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -14,10 +14,14 @@
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/err.h>
+#include <linux/srcu.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/slab.h>
+/* global SRCU for all MMs */
+static struct srcu_struct srcu;
+
/*
* This function can't run concurrently against mmu_notifier_register
* because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -25,8 +29,8 @@
* in parallel despite there being no task using this mm any more,
* through the vmas outside of the exit_mmap context, such as with
* vmtruncate. This serializes against mmu_notifier_unregister with
- * the mmu_notifier_mm->lock in addition to RCU and it serializes
- * against the other mmu notifiers with RCU. struct mmu_notifier_mm
+ * the mmu_notifier_mm->lock in addition to SRCU and it serializes
+ * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
* can't go away from under us as exit_mmap holds an mm_count pin
* itself.
*/
@@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm)
{
struct mmu_notifier *mn;
struct hlist_node *n;
+ int id;
/*
- * RCU here will block mmu_notifier_unregister until
+ * SRCU here will block mmu_notifier_unregister until
* ->release returns.
*/
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
/*
* if ->release runs before mmu_notifier_unregister it
@@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
*/
if (mn->ops->release)
mn->ops->release(mn, mm);
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
spin_lock(&mm->mmu_notifier_mm->lock);
while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
spin_unlock(&mm->mmu_notifier_mm->lock);
/*
- * synchronize_rcu here prevents mmu_notifier_release to
+ * synchronize_srcu here prevents mmu_notifier_release to
* return to exit_mmap (which would proceed freeing all pages
* in the mm) until the ->release method returns, if it was
* invoked by mmu_notifier_unregister.
@@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
* The mmu_notifier_mm can't go away from under us because one
* mm_count is hold by exit_mmap.
*/
- synchronize_rcu();
+ synchronize_srcu(&srcu);
}
/*
@@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
- int young = 0;
+ int young = 0, id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->clear_flush_young)
young |= mn->ops->clear_flush_young(mn, mm, address);
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
return young;
}
@@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
- int young = 0;
+ int young = 0, id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->test_young) {
young = mn->ops->test_young(mn, mm, address);
@@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
break;
}
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
return young;
}
@@ -126,19 +131,14 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
{
struct mmu_notifier *mn;
struct hlist_node *n;
+ int id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->change_pte)
mn->ops->change_pte(mn, mm, address, pte);
- /*
- * Some drivers don't have change_pte,
- * so we must call invalidate_page in that case.
- */
- else if (mn->ops->invalidate_page)
- mn->ops->invalidate_page(mn, mm, address);
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
}
void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -146,13 +146,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
+ int id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_page)
mn->ops->invalidate_page(mn, mm, address);
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
}
void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -160,13 +161,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
+ int id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_range_start)
mn->ops->invalidate_range_start(mn, mm, start, end);
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
}
void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -174,13 +176,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
+ int id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_range_end)
mn->ops->invalidate_range_end(mn, mm, start, end);
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
}
static int do_mmu_notifier_register(struct mmu_notifier *mn,
@@ -192,6 +195,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
BUG_ON(atomic_read(&mm->mm_users) <= 0);
+ /*
+ * Verify that mmu_notifier_init() already run and the global srcu is
+ * initialized.
+ */
+ BUG_ON(!srcu.per_cpu_ref);
+
ret = -ENOMEM;
mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
if (unlikely(!mmu_notifier_mm))
@@ -201,11 +210,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
down_write(&mm->mmap_sem);
ret = mm_take_all_locks(mm);
if (unlikely(ret))
- goto out_cleanup;
+ goto out_clean;
if (!mm_has_notifiers(mm)) {
INIT_HLIST_HEAD(&mmu_notifier_mm->list);
spin_lock_init(&mmu_notifier_mm->lock);
+
mm->mmu_notifier_mm = mmu_notifier_mm;
mmu_notifier_mm = NULL;
}
@@ -224,10 +234,9 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
spin_unlock(&mm->mmu_notifier_mm->lock);
mm_drop_all_locks(mm);
-out_cleanup:
+out_clean:
if (take_mmap_sem)
up_write(&mm->mmap_sem);
- /* kfree() does nothing if mmu_notifier_mm is NULL */
kfree(mmu_notifier_mm);
out:
BUG_ON(atomic_read(&mm->mm_users) <= 0);
@@ -274,8 +283,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm)
/*
* This releases the mm_count pin automatically and frees the mm
* structure if it was the last user of it. It serializes against
- * running mmu notifiers with RCU and against mmu_notifier_unregister
- * with the unregister lock + RCU. All sptes must be dropped before
+ * running mmu notifiers with SRCU and against mmu_notifier_unregister
+ * with the unregister lock + SRCU. All sptes must be dropped before
* calling mmu_notifier_unregister. ->release or any other notifier
* method may be invoked concurrently with mmu_notifier_unregister,
* and only after mmu_notifier_unregister returned we're guaranteed
@@ -287,11 +296,12 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
if (!hlist_unhashed(&mn->hlist)) {
/*
- * RCU here will force exit_mmap to wait ->release to finish
+ * SRCU here will force exit_mmap to wait ->release to finish
* before freeing the pages.
*/
- rcu_read_lock();
+ int id;
+ id = srcu_read_lock(&srcu);
/*
* exit_mmap will block in mmu_notifier_release to
* guarantee ->release is called before freeing the
@@ -299,7 +309,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
*/
if (mn->ops->release)
mn->ops->release(mn, mm);
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
spin_lock(&mm->mmu_notifier_mm->lock);
hlist_del_rcu(&mn->hlist);
@@ -310,10 +320,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
* Wait any running method to finish, of course including
* ->release if it was run by mmu_notifier_relase instead of us.
*/
- synchronize_rcu();
+ synchronize_srcu(&srcu);
BUG_ON(atomic_read(&mm->mm_count) <= 0);
mmdrop(mm);
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
+
+static int __init mmu_notifier_init(void)
+{
+ return init_srcu_struct(&srcu);
+}
+
+module_init(mmu_notifier_init);
diff --git a/mm/mremap.c b/mm/mremap.c
index cc06d0e48d05..1b61c2d3307a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -71,22 +71,41 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
unsigned long old_addr, unsigned long old_end,
struct vm_area_struct *new_vma, pmd_t *new_pmd,
- unsigned long new_addr)
+ unsigned long new_addr, bool need_rmap_locks)
{
struct address_space *mapping = NULL;
+ struct anon_vma *anon_vma = NULL;
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
spinlock_t *old_ptl, *new_ptl;
- if (vma->vm_file) {
- /*
- * Subtle point from Rajesh Venkatasubramanian: before
- * moving file-based ptes, we must lock truncate_pagecache
- * out, since it might clean the dst vma before the src vma,
- * and we propagate stale pages into the dst afterward.
- */
- mapping = vma->vm_file->f_mapping;
- mutex_lock(&mapping->i_mmap_mutex);
+ /*
+ * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
+ * locks to ensure that rmap will always observe either the old or the
+ * new ptes. This is the easiest way to avoid races with
+ * truncate_pagecache(), page migration, etc...
+ *
+ * When need_rmap_locks is false, we use other ways to avoid
+ * such races:
+ *
+ * - During exec() shift_arg_pages(), we use a specially tagged vma
+ * which rmap call sites look for using is_vma_temporary_stack().
+ *
+ * - During mremap(), new_vma is often known to be placed after vma
+ * in rmap traversal order. This ensures rmap will always observe
+ * either the old pte, or the new pte, or both (the page table locks
+ * serialize access to individual ptes, but only rmap traversal
+ * order guarantees that we won't miss both the old and new ptes).
+ */
+ if (need_rmap_locks) {
+ if (vma->vm_file) {
+ mapping = vma->vm_file->f_mapping;
+ mutex_lock(&mapping->i_mmap_mutex);
+ }
+ if (vma->anon_vma) {
+ anon_vma = vma->anon_vma;
+ anon_vma_lock(anon_vma);
+ }
}
/*
@@ -114,6 +133,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
spin_unlock(new_ptl);
pte_unmap(new_pte - 1);
pte_unmap_unlock(old_pte - 1, old_ptl);
+ if (anon_vma)
+ anon_vma_unlock(anon_vma);
if (mapping)
mutex_unlock(&mapping->i_mmap_mutex);
}
@@ -122,16 +143,21 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
- unsigned long new_addr, unsigned long len)
+ unsigned long new_addr, unsigned long len,
+ bool need_rmap_locks)
{
unsigned long extent, next, old_end;
pmd_t *old_pmd, *new_pmd;
bool need_flush = false;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
- mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end);
+ mmun_start = old_addr;
+ mmun_end = old_end;
+ mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
@@ -169,13 +195,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (extent > LATENCY_LIMIT)
extent = LATENCY_LIMIT;
move_ptes(vma, old_pmd, old_addr, old_addr + extent,
- new_vma, new_pmd, new_addr);
+ new_vma, new_pmd, new_addr, need_rmap_locks);
need_flush = true;
}
if (likely(need_flush))
flush_tlb_range(vma, old_end-len, old_addr);
- mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end);
+ mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
return len + old_addr - old_end; /* how much done */
}
@@ -193,6 +219,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long hiwater_vm;
int split = 0;
int err;
+ bool need_rmap_locks;
/*
* We'd prefer to avoid failure later on in do_munmap:
@@ -214,27 +241,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return err;
new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
- new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff);
+ new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
+ &need_rmap_locks);
if (!new_vma)
return -ENOMEM;
- moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
+ moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
+ need_rmap_locks);
if (moved_len < old_len) {
/*
- * Before moving the page tables from the new vma to
- * the old vma, we need to be sure the old vma is
- * queued after new vma in the same_anon_vma list to
- * prevent SMP races with rmap_walk (that could lead
- * rmap_walk to miss some page table).
- */
- anon_vma_moveto_tail(vma);
-
- /*
* On error, move entries back from new area to old,
* which will succeed since page tables still there,
* and then proceed to unmap new area instead of old.
*/
- move_page_tables(new_vma, new_addr, vma, old_addr, moved_len);
+ move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
+ true);
vma = new_vma;
old_len = new_len;
old_addr = new_addr;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 405573010f99..714d5d650470 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
return 0;
__free_pages_memory(start_pfn, end_pfn);
+ fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT),
+ start_pfn, end_pfn);
return end_pfn - start_pfn;
}
@@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid)
phys_addr_t start, end, size;
u64 i;
+ reset_zone_present_pages();
for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
count += __free_memory_core(start, end);
@@ -162,8 +165,6 @@ unsigned long __init free_all_bootmem(void)
* We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
* because in some case like Node0 doesn't have RAM installed
* low ram will be on Node1
- * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
- * will be used instead of only Node0 related
*/
return free_low_memory_core_early(MAX_NUMNODES);
}
diff --git a/mm/nommu.c b/mm/nommu.c
index d4b0c10872de..45131b41bcdb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
mutex_lock(&mapping->i_mmap_mutex);
flush_dcache_mmap_lock(mapping);
- vma_prio_tree_insert(vma, &mapping->i_mmap);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
mutex_unlock(&mapping->i_mmap_mutex);
}
@@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
mutex_lock(&mapping->i_mmap_mutex);
flush_dcache_mmap_lock(mapping);
- vma_prio_tree_remove(vma, &mapping->i_mmap);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
mutex_unlock(&mapping->i_mmap_mutex);
}
@@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
kenter("%p", vma);
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
- if (vma->vm_file) {
+ if (vma->vm_file)
fput(vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(mm);
- }
put_nommu_region(vma->vm_region);
kmem_cache_free(vm_area_cachep, vma);
}
@@ -1282,14 +1279,8 @@ unsigned long do_mmap_pgoff(struct file *file,
vma->vm_pgoff = pgoff;
if (file) {
- region->vm_file = file;
- get_file(file);
- vma->vm_file = file;
- get_file(file);
- if (vm_flags & VM_EXECUTABLE) {
- added_exe_file_vma(current->mm);
- vma->vm_mm = current->mm;
- }
+ region->vm_file = get_file(file);
+ vma->vm_file = get_file(file);
}
down_write(&nommu_region_sem);
@@ -1442,8 +1433,6 @@ error:
kmem_cache_free(vm_region_jar, region);
if (vma->vm_file)
fput(vma->vm_file);
- if (vma->vm_flags & VM_EXECUTABLE)
- removed_exe_file_vma(vma->vm_mm);
kmem_cache_free(vm_area_cachep, vma);
kleave(" = %d", ret);
return ret;
@@ -1822,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
if (addr != (pfn << PAGE_SHIFT))
return -EINVAL;
- vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+ vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
return 0;
}
EXPORT_SYMBOL(remap_pfn_range);
@@ -1963,6 +1952,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
+int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long size, pgoff_t pgoff)
+{
+ BUG();
+ return 0;
+}
+EXPORT_SYMBOL(generic_file_remap_pages);
+
static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, int write)
{
@@ -2047,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
size_t newsize)
{
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
struct vm_region *region;
pgoff_t low, high;
size_t r_size, r_top;
@@ -2059,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
mutex_lock(&inode->i_mapping->i_mmap_mutex);
/* search for VMAs that fall within the dead zone */
- vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
- low, high) {
+ vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
/* found one - only interested if it's shared out of the page
* cache */
if (vma->vm_flags & VM_SHARED) {
@@ -2076,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
* we don't check for any regions that start beyond the EOF as there
* shouldn't be any
*/
- vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
- 0, ULONG_MAX) {
+ vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap,
+ 0, ULONG_MAX) {
if (!(vma->vm_flags & VM_SHARED))
continue;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 198600861638..79e0f3e24831 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
{
task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
- "oom_adj=%d, oom_score_adj=%d\n",
- current->comm, gfp_mask, order, current->signal->oom_adj,
+ "oom_score_adj=%d\n",
+ current->comm, gfp_mask, order,
current->signal->oom_score_adj);
cpuset_print_task_mems_allowed(current);
task_unlock(current);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5ad5ce23c1e0..830893b2b3c7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1602,10 +1602,18 @@ void writeback_set_ratelimit(void)
}
static int __cpuinit
-ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
+ratelimit_handler(struct notifier_block *self, unsigned long action,
+ void *hcpu)
{
- writeback_set_ratelimit();
- return NOTIFY_DONE;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ case CPU_DEAD:
+ writeback_set_ratelimit();
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
}
static struct notifier_block __cpuinitdata ratelimit_nb = {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c13ea7538891..5b74de6702e0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -558,7 +558,8 @@ static inline void __free_one_page(struct page *page,
if (page_is_guard(buddy)) {
clear_page_guard_flag(buddy);
set_page_private(page, 0);
- __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+ __mod_zone_freepage_state(zone, 1 << order,
+ migratetype);
} else {
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
@@ -597,17 +598,6 @@ out:
zone->free_area[order].nr_free++;
}
-/*
- * free_page_mlock() -- clean up attempts to free and mlocked() page.
- * Page should not be on lru, so no need to fix that up.
- * free_pages_check() will verify...
- */
-static inline void free_page_mlock(struct page *page)
-{
- __dec_zone_page_state(page, NR_MLOCK);
- __count_vm_event(UNEVICTABLE_MLOCKFREED);
-}
-
static inline int free_pages_check(struct page *page)
{
if (unlikely(page_mapcount(page) |
@@ -668,12 +658,17 @@ static void free_pcppages_bulk(struct zone *zone, int count,
batch_free = to_free;
do {
+ int mt; /* migratetype of the to-be-freed page */
+
page = list_entry(list->prev, struct page, lru);
/* must delete as __free_one_page list manipulates */
list_del(&page->lru);
+ mt = get_freepage_migratetype(page);
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
- __free_one_page(page, zone, 0, page_private(page));
- trace_mm_page_pcpu_drain(page, 0, page_private(page));
+ __free_one_page(page, zone, 0, mt);
+ trace_mm_page_pcpu_drain(page, 0, mt);
+ if (is_migrate_cma(mt))
+ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
} while (--to_free && --batch_free && !list_empty(list));
}
__mod_zone_page_state(zone, NR_FREE_PAGES, count);
@@ -688,7 +683,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
zone->pages_scanned = 0;
__free_one_page(page, zone, order, migratetype);
- __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+ if (unlikely(migratetype != MIGRATE_ISOLATE))
+ __mod_zone_freepage_state(zone, 1 << order, migratetype);
spin_unlock(&zone->lock);
}
@@ -721,17 +717,16 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
static void __free_pages_ok(struct page *page, unsigned int order)
{
unsigned long flags;
- int wasMlocked = __TestClearPageMlocked(page);
+ int migratetype;
if (!free_pages_prepare(page, order))
return;
local_irq_save(flags);
- if (unlikely(wasMlocked))
- free_page_mlock(page);
__count_vm_events(PGFREE, 1 << order);
- free_one_page(page_zone(page), page, order,
- get_pageblock_migratetype(page));
+ migratetype = get_pageblock_migratetype(page);
+ set_freepage_migratetype(page, migratetype);
+ free_one_page(page_zone(page), page, order, migratetype);
local_irq_restore(flags);
}
@@ -811,7 +806,8 @@ static inline void expand(struct zone *zone, struct page *page,
set_page_guard_flag(&page[size]);
set_page_private(&page[size], high);
/* Guard pages are not available for any usage */
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+ __mod_zone_freepage_state(zone, -(1 << high),
+ migratetype);
continue;
}
#endif
@@ -915,7 +911,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
* Note that start_page and end_pages are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
-static int move_freepages(struct zone *zone,
+int move_freepages(struct zone *zone,
struct page *start_page, struct page *end_page,
int migratetype)
{
@@ -951,6 +947,7 @@ static int move_freepages(struct zone *zone,
order = page_order(page);
list_move(&page->lru,
&zone->free_area[order].free_list[migratetype]);
+ set_freepage_migratetype(page, migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -1135,8 +1132,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
mt = migratetype;
}
- set_page_private(page, mt);
+ set_freepage_migratetype(page, mt);
list = &page->lru;
+ if (is_migrate_cma(mt))
+ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
+ -(1 << order));
}
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
@@ -1296,16 +1296,13 @@ void free_hot_cold_page(struct page *page, int cold)
struct per_cpu_pages *pcp;
unsigned long flags;
int migratetype;
- int wasMlocked = __TestClearPageMlocked(page);
if (!free_pages_prepare(page, 0))
return;
migratetype = get_pageblock_migratetype(page);
- set_page_private(page, migratetype);
+ set_freepage_migratetype(page, migratetype);
local_irq_save(flags);
- if (unlikely(wasMlocked))
- free_page_mlock(page);
__count_vm_event(PGFREE);
/*
@@ -1380,20 +1377,16 @@ void split_page(struct page *page, unsigned int order)
}
/*
- * Similar to split_page except the page is already free. As this is only
- * being used for migration, the migratetype of the block also changes.
- * As this is called with interrupts disabled, the caller is responsible
- * for calling arch_alloc_page() and kernel_map_page() after interrupts
- * are enabled.
- *
- * Note: this is probably too low level an operation for use in drivers.
- * Please consult with lkml before using this in your driver.
+ * Similar to the split_page family of functions except that the page
+ * required at the given order and being isolated now to prevent races
+ * with parallel allocators
*/
-int split_free_page(struct page *page)
+int capture_free_page(struct page *page, int alloc_order, int migratetype)
{
unsigned int order;
unsigned long watermark;
struct zone *zone;
+ int mt;
BUG_ON(!PageBuddy(page));
@@ -1409,12 +1402,16 @@ int split_free_page(struct page *page)
list_del(&page->lru);
zone->free_area[order].nr_free--;
rmv_page_order(page);
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
- /* Split into individual pages */
- set_page_refcounted(page);
- split_page(page, order);
+ mt = get_pageblock_migratetype(page);
+ if (unlikely(mt != MIGRATE_ISOLATE))
+ __mod_zone_freepage_state(zone, -(1UL << order), mt);
+ if (alloc_order != order)
+ expand(zone, page, alloc_order, order,
+ &zone->free_area[order], migratetype);
+
+ /* Set the pageblock if the captured page is at least a pageblock */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
@@ -1425,7 +1422,35 @@ int split_free_page(struct page *page)
}
}
- return 1 << order;
+ return 1UL << order;
+}
+
+/*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_page() after interrupts
+ * are enabled.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+ unsigned int order;
+ int nr_pages;
+
+ BUG_ON(!PageBuddy(page));
+ order = page_order(page);
+
+ nr_pages = capture_free_page(page, order, 0);
+ if (!nr_pages)
+ return 0;
+
+ /* Split into individual pages */
+ set_page_refcounted(page);
+ split_page(page, order);
+ return nr_pages;
}
/*
@@ -1484,7 +1509,8 @@ again:
spin_unlock(&zone->lock);
if (!page)
goto failed;
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
+ __mod_zone_freepage_state(zone, -(1 << order),
+ get_pageblock_migratetype(page));
}
__count_zone_vm_events(PGALLOC, zone, 1 << order);
@@ -1501,19 +1527,6 @@ failed:
return NULL;
}
-/* The ALLOC_WMARK bits are used as an index to zone->watermark */
-#define ALLOC_WMARK_MIN WMARK_MIN
-#define ALLOC_WMARK_LOW WMARK_LOW
-#define ALLOC_WMARK_HIGH WMARK_HIGH
-#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
-
-/* Mask to get the watermark bits */
-#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
-
-#define ALLOC_HARDER 0x10 /* try to alloc harder */
-#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
-#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
-
#ifdef CONFIG_FAIL_PAGE_ALLOC
static struct {
@@ -1608,7 +1621,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
-
+#ifdef CONFIG_CMA
+ /* If allocation can't use CMA areas don't use free CMA pages */
+ if (!(alloc_flags & ALLOC_CMA))
+ free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
if (free_pages <= min + lowmem_reserve)
return false;
for (o = 0; o < order; o++) {
@@ -1782,6 +1799,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
}
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+{
+ return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
+}
+
+static void __paginginit init_zone_allows_reclaim(int nid)
+{
+ int i;
+
+ for_each_online_node(i)
+ if (node_distance(nid, i) <= RECLAIM_DISTANCE)
+ node_set(i, NODE_DATA(nid)->reclaim_nodes);
+ else
+ zone_reclaim_mode = 1;
+}
+
#else /* CONFIG_NUMA */
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1802,6 +1835,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
static void zlc_clear_zones_full(struct zonelist *zonelist)
{
}
+
+static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
+{
+ return true;
+}
+
+static inline void init_zone_allows_reclaim(int nid)
+{
+}
#endif /* CONFIG_NUMA */
/*
@@ -1886,7 +1928,8 @@ zonelist_scan:
did_zlc_setup = 1;
}
- if (zone_reclaim_mode == 0)
+ if (zone_reclaim_mode == 0 ||
+ !zone_allows_reclaim(preferred_zone, zone))
goto this_zone_full;
/*
@@ -2105,7 +2148,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
bool *contended_compaction, bool *deferred_compaction,
unsigned long *did_some_progress)
{
- struct page *page;
+ struct page *page = NULL;
if (!order)
return NULL;
@@ -2118,10 +2161,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
nodemask, sync_migration,
- contended_compaction);
+ contended_compaction, &page);
current->flags &= ~PF_MEMALLOC;
- if (*did_some_progress != COMPACT_SKIPPED) {
+ /* If compaction captured a page, prep and use it */
+ if (page) {
+ prep_new_page(page, order, gfp_mask);
+ goto got_page;
+ }
+
+ if (*did_some_progress != COMPACT_SKIPPED) {
/* Page migration frees to the PCP lists but we want merging */
drain_pages(get_cpu());
put_cpu();
@@ -2131,6 +2180,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
alloc_flags & ~ALLOC_NO_WATERMARKS,
preferred_zone, migratetype);
if (page) {
+got_page:
+ preferred_zone->compact_blockskip_flush = false;
preferred_zone->compact_considered = 0;
preferred_zone->compact_defer_shift = 0;
if (order >= preferred_zone->compact_order_failed)
@@ -2315,7 +2366,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
unlikely(test_thread_flag(TIF_MEMDIE))))
alloc_flags |= ALLOC_NO_WATERMARKS;
}
-
+#ifdef CONFIG_CMA
+ if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+#endif
return alloc_flags;
}
@@ -2362,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
goto nopage;
restart:
- if (!(gfp_mask & __GFP_NO_KSWAPD))
- wake_all_kswapd(order, zonelist, high_zoneidx,
- zone_idx(preferred_zone));
+ wake_all_kswapd(order, zonelist, high_zoneidx,
+ zone_idx(preferred_zone));
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2441,7 +2494,7 @@ rebalance:
* system then fail the allocation instead of entering direct reclaim.
*/
if ((deferred_compaction || contended_compaction) &&
- (gfp_mask & __GFP_NO_KSWAPD))
+ (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
goto nopage;
/* Try direct reclaim and then allocating */
@@ -2541,6 +2594,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
+ int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
gfp_mask &= gfp_allowed_mask;
@@ -2569,9 +2623,13 @@ retry_cpuset:
if (!preferred_zone)
goto out;
+#ifdef CONFIG_CMA
+ if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+#endif
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
- zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
+ zonelist, high_zoneidx, alloc_flags,
preferred_zone, migratetype);
if (unlikely(!page))
page = __alloc_pages_slowpath(gfp_mask, order,
@@ -2852,7 +2910,8 @@ void show_free_areas(unsigned int filter)
" unevictable:%lu"
" dirty:%lu writeback:%lu unstable:%lu\n"
" free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
- " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
+ " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
+ " free_cma:%lu\n",
global_page_state(NR_ACTIVE_ANON),
global_page_state(NR_INACTIVE_ANON),
global_page_state(NR_ISOLATED_ANON),
@@ -2869,7 +2928,8 @@ void show_free_areas(unsigned int filter)
global_page_state(NR_FILE_MAPPED),
global_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
- global_page_state(NR_BOUNCE));
+ global_page_state(NR_BOUNCE),
+ global_page_state(NR_FREE_CMA_PAGES));
for_each_populated_zone(zone) {
int i;
@@ -2901,6 +2961,7 @@ void show_free_areas(unsigned int filter)
" pagetables:%lukB"
" unstable:%lukB"
" bounce:%lukB"
+ " free_cma:%lukB"
" writeback_tmp:%lukB"
" pages_scanned:%lu"
" all_unreclaimable? %s"
@@ -2930,6 +2991,7 @@ void show_free_areas(unsigned int filter)
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_UNSTABLE_NFS)),
K(zone_page_state(zone, NR_BOUNCE)),
+ K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
zone->pages_scanned,
(zone->all_unreclaimable ? "yes" : "no")
@@ -3328,21 +3390,13 @@ static void build_zonelists(pg_data_t *pgdat)
j = 0;
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
- int distance = node_distance(local_node, node);
-
- /*
- * If another node is sufficiently far away then it is better
- * to reclaim pages in a zone before going off node.
- */
- if (distance > RECLAIM_DISTANCE)
- zone_reclaim_mode = 1;
-
/*
* We don't want to pressure a particular node.
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
- if (distance != node_distance(local_node, prev_node))
+ if (node_distance(local_node, node) !=
+ node_distance(local_node, prev_node))
node_load[node] = load;
prev_node = node;
@@ -4438,11 +4492,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone->spanned_pages = size;
zone->present_pages = realsize;
-#if defined CONFIG_COMPACTION || defined CONFIG_CMA
- zone->compact_cached_free_pfn = zone->zone_start_pfn +
- zone->spanned_pages;
- zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
-#endif
#ifdef CONFIG_NUMA
zone->node = nid;
zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4521,6 +4570,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
+ init_zone_allows_reclaim(nid);
calculate_node_totalpages(pgdat, zones_size, zholes_size);
alloc_node_mem_map(pgdat);
@@ -4879,7 +4929,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
zone_movable_pfn[i] << PAGE_SHIFT);
}
- /* Print out the early_node_map[] */
+ /* Print out the early node map */
printk("Early memory node ranges\n");
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
@@ -5619,47 +5669,28 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
pageblock_nr_pages));
}
-static struct page *
-__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
- int **resultp)
-{
- gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
-
- if (PageHighMem(page))
- gfp_mask |= __GFP_HIGHMEM;
-
- return alloc_page(gfp_mask);
-}
-
/* [start, end) must belong to a single zone. */
-static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
+static int __alloc_contig_migrate_range(struct compact_control *cc,
+ unsigned long start, unsigned long end)
{
/* This function is based on compact_zone() from compaction.c. */
-
+ unsigned long nr_reclaimed;
unsigned long pfn = start;
unsigned int tries = 0;
int ret = 0;
- struct compact_control cc = {
- .nr_migratepages = 0,
- .order = -1,
- .zone = page_zone(pfn_to_page(start)),
- .sync = true,
- };
- INIT_LIST_HEAD(&cc.migratepages);
-
migrate_prep_local();
- while (pfn < end || !list_empty(&cc.migratepages)) {
+ while (pfn < end || !list_empty(&cc->migratepages)) {
if (fatal_signal_pending(current)) {
ret = -EINTR;
break;
}
- if (list_empty(&cc.migratepages)) {
- cc.nr_migratepages = 0;
- pfn = isolate_migratepages_range(cc.zone, &cc,
- pfn, end);
+ if (list_empty(&cc->migratepages)) {
+ cc->nr_migratepages = 0;
+ pfn = isolate_migratepages_range(cc->zone, cc,
+ pfn, end, true);
if (!pfn) {
ret = -EINTR;
break;
@@ -5670,12 +5701,16 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
break;
}
- ret = migrate_pages(&cc.migratepages,
- __alloc_contig_migrate_alloc,
+ nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
+ &cc->migratepages);
+ cc->nr_migratepages -= nr_reclaimed;
+
+ ret = migrate_pages(&cc->migratepages,
+ alloc_migrate_target,
0, false, MIGRATE_SYNC);
}
- putback_lru_pages(&cc.migratepages);
+ putback_lru_pages(&cc->migratepages);
return ret > 0 ? 0 : ret;
}
@@ -5754,6 +5789,15 @@ int alloc_contig_range(unsigned long start, unsigned long end,
unsigned long outer_start, outer_end;
int ret = 0, order;
+ struct compact_control cc = {
+ .nr_migratepages = 0,
+ .order = -1,
+ .zone = page_zone(pfn_to_page(start)),
+ .sync = true,
+ .ignore_skip_hint = true,
+ };
+ INIT_LIST_HEAD(&cc.migratepages);
+
/*
* What we do here is we mark all pageblocks in range as
* MIGRATE_ISOLATE. Because pageblock and max order pages may
@@ -5781,9 +5825,9 @@ int alloc_contig_range(unsigned long start, unsigned long end,
ret = start_isolate_page_range(pfn_max_align_down(start),
pfn_max_align_up(end), migratetype);
if (ret)
- goto done;
+ return ret;
- ret = __alloc_contig_migrate_range(start, end);
+ ret = __alloc_contig_migrate_range(&cc, start, end);
if (ret)
goto done;
@@ -5832,7 +5876,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
__reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
/* Grab isolated pages from freelists. */
- outer_end = isolate_freepages_range(outer_start, end);
+ outer_end = isolate_freepages_range(&cc, outer_start, end);
if (!outer_end) {
ret = -EBUSY;
goto done;
@@ -5874,6 +5918,7 @@ static int __meminit __zone_pcp_update(void *data)
local_irq_save(flags);
if (pcp->count > 0)
free_pcppages_bulk(zone, pcp->count, pcp);
+ drain_zonestat(zone, pset);
setup_pageset(pset, batch);
local_irq_restore(flags);
}
@@ -5890,10 +5935,16 @@ void __meminit zone_pcp_update(struct zone *zone)
void zone_pcp_reset(struct zone *zone)
{
unsigned long flags;
+ int cpu;
+ struct per_cpu_pageset *pset;
/* avoid races with drain_pages() */
local_irq_save(flags);
if (zone->pageset != &boot_pageset) {
+ for_each_online_cpu(cpu) {
+ pset = per_cpu_ptr(zone->pageset, cpu);
+ drain_zonestat(zone, pset);
+ }
free_percpu(zone->pageset);
zone->pageset = &boot_pageset;
}
@@ -6047,3 +6098,37 @@ void dump_page(struct page *page)
dump_page_flags(page->flags);
mem_cgroup_print_bad_page(page);
}
+
+/* reset zone->present_pages */
+void reset_zone_present_pages(void)
+{
+ struct zone *z;
+ int i, nid;
+
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ z = NODE_DATA(nid)->node_zones + i;
+ z->present_pages = 0;
+ }
+ }
+}
+
+/* calculate zone's present pages in buddy system */
+void fixup_zone_present_pages(int nid, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ struct zone *z;
+ unsigned long zone_start_pfn, zone_end_pfn;
+ int i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ z = NODE_DATA(nid)->node_zones + i;
+ zone_start_pfn = z->zone_start_pfn;
+ zone_end_pfn = zone_start_pfn + z->spanned_pages;
+
+ /* if the two regions intersect */
+ if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn))
+ z->present_pages += min(end_pfn, zone_end_pfn) -
+ max(start_pfn, zone_start_pfn);
+ }
+}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 247d1f175739..f2f5b4818e94 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -76,8 +76,13 @@ int set_migratetype_isolate(struct page *page)
out:
if (!ret) {
+ unsigned long nr_pages;
+ int migratetype = get_pageblock_migratetype(page);
+
set_pageblock_isolate(page);
- move_freepages_block(zone, page, MIGRATE_ISOLATE);
+ nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
+
+ __mod_zone_freepage_state(zone, -nr_pages, migratetype);
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -89,12 +94,14 @@ out:
void unset_migratetype_isolate(struct page *page, unsigned migratetype)
{
struct zone *zone;
- unsigned long flags;
+ unsigned long flags, nr_pages;
+
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
goto out;
- move_freepages_block(zone, page, migratetype);
+ nr_pages = move_freepages_block(zone, page, migratetype);
+ __mod_zone_freepage_state(zone, nr_pages, migratetype);
restore_pageblock_isolate(page, migratetype);
out:
spin_unlock_irqrestore(&zone->lock, flags);
@@ -193,10 +200,25 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
continue;
}
page = pfn_to_page(pfn);
- if (PageBuddy(page))
+ if (PageBuddy(page)) {
+ /*
+ * If race between isolatation and allocation happens,
+ * some free pages could be in MIGRATE_MOVABLE list
+ * although pageblock's migratation type of the page
+ * is MIGRATE_ISOLATE. Catch it and move the page into
+ * MIGRATE_ISOLATE list.
+ */
+ if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
+ struct page *end_page;
+
+ end_page = page + (1 << page_order(page)) - 1;
+ move_freepages(page_zone(page), page, end_page,
+ MIGRATE_ISOLATE);
+ }
pfn += 1 << page_order(page);
+ }
else if (page_count(page) == 0 &&
- page_private(page) == MIGRATE_ISOLATE)
+ get_freepage_migratetype(page) == MIGRATE_ISOLATE)
pfn += 1;
else
break;
@@ -233,3 +255,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
spin_unlock_irqrestore(&zone->lock, flags);
return ret ? 0 : -EBUSY;
}
+
+struct page *alloc_migrate_target(struct page *page, unsigned long private,
+ int **resultp)
+{
+ gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
+
+ if (PageHighMem(page))
+ gfp_mask |= __GFP_HIGHMEM;
+
+ return alloc_page(gfp_mask);
+}
diff --git a/mm/percpu.c b/mm/percpu.c
index bb4be7435ce3..ddc5efb9c5bb 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1370,7 +1370,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
#ifdef CONFIG_SMP
-const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
+const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
[PCPU_FC_AUTO] = "auto",
[PCPU_FC_EMBED] = "embed",
[PCPU_FC_PAGE] = "page",
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 74c0ddaa6fa0..e642627da6b7 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -120,3 +120,53 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
+
+#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
+{
+ assert_spin_locked(&mm->page_table_lock);
+
+ /* FIFO */
+ if (!mm->pmd_huge_pte)
+ INIT_LIST_HEAD(&pgtable->lru);
+ else
+ list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
+ mm->pmd_huge_pte = pgtable;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
+#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* no "address" argument so destroys page coloring of some arch */
+pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
+{
+ pgtable_t pgtable;
+
+ assert_spin_locked(&mm->page_table_lock);
+
+ /* FIFO */
+ pgtable = mm->pmd_huge_pte;
+ if (list_empty(&pgtable->lru))
+ mm->pmd_huge_pte = NULL;
+ else {
+ mm->pmd_huge_pte = list_entry(pgtable->lru.next,
+ struct page, lru);
+ list_del(&pgtable->lru);
+ }
+ return pgtable;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
deleted file mode 100644
index 799dcfd7cd8c..000000000000
--- a/mm/prio_tree.c
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * mm/prio_tree.c - priority search tree for mapping->i_mmap
- *
- * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
- *
- * This file is released under the GPL v2.
- *
- * Based on the radix priority search tree proposed by Edward M. McCreight
- * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
- *
- * 02Feb2004 Initial version
- */
-
-#include <linux/mm.h>
-#include <linux/prio_tree.h>
-#include <linux/prefetch.h>
-
-/*
- * See lib/prio_tree.c for details on the general radix priority search tree
- * code.
- */
-
-/*
- * The following #defines are mirrored from lib/prio_tree.c. They're only used
- * for debugging, and should be removed (along with the debugging code using
- * them) when switching also VMAs to the regular prio_tree code.
- */
-
-#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
-#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
-/* avoid overflow */
-#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
-
-/*
- * Radix priority search tree for address_space->i_mmap
- *
- * For each vma that map a unique set of file pages i.e., unique [radix_index,
- * heap_index] value, we have a corresponding priority search tree node. If
- * multiple vmas have identical [radix_index, heap_index] value, then one of
- * them is used as a tree node and others are stored in a vm_set list. The tree
- * node points to the first vma (head) of the list using vm_set.head.
- *
- * prio_tree_root
- * |
- * A vm_set.head
- * / \ /
- * L R -> H-I-J-K-M-N-O-P-Q-S
- * ^ ^ <-- vm_set.list -->
- * tree nodes
- *
- * We need some way to identify whether a vma is a tree node, head of a vm_set
- * list, or just a member of a vm_set list. We cannot use vm_flags to store
- * such information. The reason is, in the above figure, it is possible that
- * vm_flags' of R and H are covered by the different mmap_sems. When R is
- * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
- * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
- * That's why some trick involving shared.vm_set.parent is used for identifying
- * tree nodes and list head nodes.
- *
- * vma radix priority search tree node rules:
- *
- * vma->shared.vm_set.parent != NULL ==> a tree node
- * vma->shared.vm_set.head != NULL ==> list of others mapping same range
- * vma->shared.vm_set.head == NULL ==> no others map the same range
- *
- * vma->shared.vm_set.parent == NULL
- * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
- * vma->shared.vm_set.head == NULL ==> a list node
- */
-
-/*
- * Add a new vma known to map the same set of pages as the old vma:
- * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
- * Note that it just happens to work correctly on i_mmap_nonlinear too.
- */
-void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
-{
- /* Leave these BUG_ONs till prio_tree patch stabilizes */
- BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
- BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
-
- vma->shared.vm_set.head = NULL;
- vma->shared.vm_set.parent = NULL;
-
- if (!old->shared.vm_set.parent)
- list_add(&vma->shared.vm_set.list,
- &old->shared.vm_set.list);
- else if (old->shared.vm_set.head)
- list_add_tail(&vma->shared.vm_set.list,
- &old->shared.vm_set.head->shared.vm_set.list);
- else {
- INIT_LIST_HEAD(&vma->shared.vm_set.list);
- vma->shared.vm_set.head = old;
- old->shared.vm_set.head = vma;
- }
-}
-
-void vma_prio_tree_insert(struct vm_area_struct *vma,
- struct prio_tree_root *root)
-{
- struct prio_tree_node *ptr;
- struct vm_area_struct *old;
-
- vma->shared.vm_set.head = NULL;
-
- ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
- if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
- old = prio_tree_entry(ptr, struct vm_area_struct,
- shared.prio_tree_node);
- vma_prio_tree_add(vma, old);
- }
-}
-
-void vma_prio_tree_remove(struct vm_area_struct *vma,
- struct prio_tree_root *root)
-{
- struct vm_area_struct *node, *head, *new_head;
-
- if (!vma->shared.vm_set.head) {
- if (!vma->shared.vm_set.parent)
- list_del_init(&vma->shared.vm_set.list);
- else
- raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
- } else {
- /* Leave this BUG_ON till prio_tree patch stabilizes */
- BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
- if (vma->shared.vm_set.parent) {
- head = vma->shared.vm_set.head;
- if (!list_empty(&head->shared.vm_set.list)) {
- new_head = list_entry(
- head->shared.vm_set.list.next,
- struct vm_area_struct,
- shared.vm_set.list);
- list_del_init(&head->shared.vm_set.list);
- } else
- new_head = NULL;
-
- raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
- &head->shared.prio_tree_node);
- head->shared.vm_set.head = new_head;
- if (new_head)
- new_head->shared.vm_set.head = head;
-
- } else {
- node = vma->shared.vm_set.head;
- if (!list_empty(&vma->shared.vm_set.list)) {
- new_head = list_entry(
- vma->shared.vm_set.list.next,
- struct vm_area_struct,
- shared.vm_set.list);
- list_del_init(&vma->shared.vm_set.list);
- node->shared.vm_set.head = new_head;
- new_head->shared.vm_set.head = node;
- } else
- node->shared.vm_set.head = NULL;
- }
- }
-}
-
-/*
- * Helper function to enumerate vmas that map a given file page or a set of
- * contiguous file pages. The function returns vmas that at least map a single
- * page in the given range of contiguous file pages.
- */
-struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
- struct prio_tree_iter *iter)
-{
- struct prio_tree_node *ptr;
- struct vm_area_struct *next;
-
- if (!vma) {
- /*
- * First call is with NULL vma
- */
- ptr = prio_tree_next(iter);
- if (ptr) {
- next = prio_tree_entry(ptr, struct vm_area_struct,
- shared.prio_tree_node);
- prefetch(next->shared.vm_set.head);
- return next;
- } else
- return NULL;
- }
-
- if (vma->shared.vm_set.parent) {
- if (vma->shared.vm_set.head) {
- next = vma->shared.vm_set.head;
- prefetch(next->shared.vm_set.list.next);
- return next;
- }
- } else {
- next = list_entry(vma->shared.vm_set.list.next,
- struct vm_area_struct, shared.vm_set.list);
- if (!next->shared.vm_set.head) {
- prefetch(next->shared.vm_set.list.next);
- return next;
- }
- }
-
- ptr = prio_tree_next(iter);
- if (ptr) {
- next = prio_tree_entry(ptr, struct vm_area_struct,
- shared.prio_tree_node);
- prefetch(next->shared.vm_set.head);
- return next;
- } else
- return NULL;
-}
diff --git a/mm/readahead.c b/mm/readahead.c
index ea8f8fa21649..7963f2391236 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -579,19 +579,19 @@ do_readahead(struct address_space *mapping, struct file *filp,
SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
{
ssize_t ret;
- struct file *file;
+ struct fd f;
ret = -EBADF;
- file = fget(fd);
- if (file) {
- if (file->f_mode & FMODE_READ) {
- struct address_space *mapping = file->f_mapping;
+ f = fdget(fd);
+ if (f.file) {
+ if (f.file->f_mode & FMODE_READ) {
+ struct address_space *mapping = f.file->f_mapping;
pgoff_t start = offset >> PAGE_CACHE_SHIFT;
pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
unsigned long len = end - start + 1;
- ret = do_readahead(mapping, file, start, len);
+ ret = do_readahead(mapping, f.file, start, len);
}
- fput(file);
+ fdput(f);
}
return ret;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 0f3b7cda2a24..2ee1ef0f317b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
+#include <linux/backing-dev.h>
#include <asm/tlbflush.h>
@@ -127,12 +128,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
avc->vma = vma;
avc->anon_vma = anon_vma;
list_add(&avc->same_vma, &vma->anon_vma_chain);
-
- /*
- * It's critical to add new vmas to the tail of the anon_vma,
- * see comment in huge_memory.c:__split_huge_page().
- */
- list_add_tail(&avc->same_anon_vma, &anon_vma->head);
+ anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}
/**
@@ -269,51 +265,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
}
/*
- * Some rmap walk that needs to find all ptes/hugepmds without false
- * negatives (like migrate and split_huge_page) running concurrent
- * with operations that copy or move pagetables (like mremap() and
- * fork()) to be safe. They depend on the anon_vma "same_anon_vma"
- * list to be in a certain order: the dst_vma must be placed after the
- * src_vma in the list. This is always guaranteed by fork() but
- * mremap() needs to call this function to enforce it in case the
- * dst_vma isn't newly allocated and chained with the anon_vma_clone()
- * function but just an extension of a pre-existing vma through
- * vma_merge.
- *
- * NOTE: the same_anon_vma list can still be changed by other
- * processes while mremap runs because mremap doesn't hold the
- * anon_vma mutex to prevent modifications to the list while it
- * runs. All we need to enforce is that the relative order of this
- * process vmas isn't changing (we don't care about other vmas
- * order). Each vma corresponds to an anon_vma_chain structure so
- * there's no risk that other processes calling anon_vma_moveto_tail()
- * and changing the same_anon_vma list under mremap() will screw with
- * the relative order of this process vmas in the list, because we
- * they can't alter the order of any vma that belongs to this
- * process. And there can't be another anon_vma_moveto_tail() running
- * concurrently with mremap() coming from this process because we hold
- * the mmap_sem for the whole mremap(). fork() ordering dependency
- * also shouldn't be affected because fork() only cares that the
- * parent vmas are placed in the list before the child vmas and
- * anon_vma_moveto_tail() won't reorder vmas from either the fork()
- * parent or child.
- */
-void anon_vma_moveto_tail(struct vm_area_struct *dst)
-{
- struct anon_vma_chain *pavc;
- struct anon_vma *root = NULL;
-
- list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
- struct anon_vma *anon_vma = pavc->anon_vma;
- VM_BUG_ON(pavc->vma != dst);
- root = lock_anon_vma_root(root, anon_vma);
- list_del(&pavc->same_anon_vma);
- list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
- }
- unlock_anon_vma_root(root);
-}
-
-/*
* Attach vma to its own anon_vma, as well as to the anon_vmas that
* the corresponding VMA in the parent process is attached to.
* Returns 0 on success, non-zero on failure.
@@ -381,13 +332,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
struct anon_vma *anon_vma = avc->anon_vma;
root = lock_anon_vma_root(root, anon_vma);
- list_del(&avc->same_anon_vma);
+ anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
/*
* Leave empty anon_vmas on the list - we'll need
* to free them outside the lock.
*/
- if (list_empty(&anon_vma->head))
+ if (RB_EMPTY_ROOT(&anon_vma->rb_root))
continue;
list_del(&avc->same_vma);
@@ -416,7 +367,7 @@ static void anon_vma_ctor(void *data)
mutex_init(&anon_vma->mutex);
atomic_set(&anon_vma->refcount, 0);
- INIT_LIST_HEAD(&anon_vma->head);
+ anon_vma->rb_root = RB_ROOT;
}
void __init anon_vma_init(void)
@@ -560,22 +511,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
/*
* At what user virtual address is page expected in @vma?
- * Returns virtual address or -EFAULT if page's index/offset is not
- * within the range mapped the @vma.
*/
-inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+static inline unsigned long
+__vma_address(struct page *page, struct vm_area_struct *vma)
{
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- unsigned long address;
if (unlikely(is_vm_hugetlb_page(vma)))
pgoff = page->index << huge_page_order(page_hstate(page));
- address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
- /* page should be within @vma mapping range */
- return -EFAULT;
- }
+
+ return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+}
+
+inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ unsigned long address = __vma_address(page, vma);
+
+ /* page should be within @vma mapping range */
+ VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+
return address;
}
@@ -585,6 +540,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
+ unsigned long address;
if (PageAnon(page)) {
struct anon_vma *page__anon_vma = page_anon_vma(page);
/*
@@ -600,7 +556,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
return -EFAULT;
} else
return -EFAULT;
- return vma_address(page, vma);
+ address = __vma_address(page, vma);
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ return -EFAULT;
+ return address;
}
/*
@@ -674,8 +633,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
pte_t *pte;
spinlock_t *ptl;
- address = vma_address(page, vma);
- if (address == -EFAULT) /* out of vma range */
+ address = __vma_address(page, vma);
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
return 0;
pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
if (!pte) /* the page is not in this mm */
@@ -769,6 +728,7 @@ static int page_referenced_anon(struct page *page,
{
unsigned int mapcount;
struct anon_vma *anon_vma;
+ pgoff_t pgoff;
struct anon_vma_chain *avc;
int referenced = 0;
@@ -777,11 +737,10 @@ static int page_referenced_anon(struct page *page,
return referenced;
mapcount = page_mapcount(page);
- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
/*
* If we are reclaiming on behalf of a cgroup, skip
* counting on behalf of references from different
@@ -820,7 +779,6 @@ static int page_referenced_file(struct page *page,
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
int referenced = 0;
/*
@@ -846,10 +804,8 @@ static int page_referenced_file(struct page *page,
*/
mapcount = page_mapcount(page);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
/*
* If we are reclaiming on behalf of a cgroup, skip
* counting on behalf of references from different
@@ -929,7 +885,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
pte_t entry;
flush_cache_page(vma, address, pte_pfn(*pte));
- entry = ptep_clear_flush_notify(vma, address, pte);
+ entry = ptep_clear_flush(vma, address, pte);
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
set_pte_at(mm, address, pte, entry);
@@ -937,6 +893,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
}
pte_unmap_unlock(pte, ptl);
+
+ if (ret)
+ mmu_notifier_invalidate_page(mm, address);
out:
return ret;
}
@@ -945,17 +904,14 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
{
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
int ret = 0;
BUG_ON(PageAnon(page));
mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
if (vma->vm_flags & VM_SHARED) {
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
ret += page_mkclean_one(page, vma, address);
}
}
@@ -971,11 +927,8 @@ int page_mkclean(struct page *page)
if (page_mapped(page)) {
struct address_space *mapping = page_mapping(page);
- if (mapping) {
+ if (mapping)
ret = page_mkclean_file(mapping, page);
- if (page_test_and_clear_dirty(page_to_pfn(page), 1))
- ret = 1;
- }
}
return ret;
@@ -1128,7 +1081,7 @@ void page_add_new_anon_rmap(struct page *page,
else
__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
__page_set_anon_rmap(page, vma, address, 1);
- if (page_evictable(page, vma))
+ if (!mlocked_vma_newpage(vma, page))
lru_cache_add_lru(page, LRU_ACTIVE_ANON);
else
add_page_to_unevictable_list(page);
@@ -1161,6 +1114,7 @@ void page_add_file_rmap(struct page *page)
*/
void page_remove_rmap(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
bool anon = PageAnon(page);
bool locked;
unsigned long flags;
@@ -1183,8 +1137,19 @@ void page_remove_rmap(struct page *page)
* this if the page is anon, so about to be freed; but perhaps
* not if it's in swapcache - there might be another pte slot
* containing the swap entry, but page not yet written to swap.
+ *
+ * And we can skip it on file pages, so long as the filesystem
+ * participates in dirty tracking; but need to catch shm and tmpfs
+ * and ramfs pages which have been modified since creation by read
+ * fault.
+ *
+ * Note that mapping must be decided above, before decrementing
+ * mapcount (which luckily provides a barrier): once page is unmapped,
+ * it could be truncated and page->mapping reset to NULL at any moment.
+ * Note also that we are relying on page_mapping(page) to set mapping
+ * to &swapper_space when PageSwapCache(page).
*/
- if ((!anon || PageSwapCache(page)) &&
+ if (mapping && !mapping_cap_account_dirty(mapping) &&
page_test_and_clear_dirty(page_to_pfn(page), 1))
set_page_dirty(page);
/*
@@ -1203,7 +1168,10 @@ void page_remove_rmap(struct page *page)
} else {
__dec_zone_page_state(page, NR_FILE_MAPPED);
mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
}
+ if (unlikely(PageMlocked(page)))
+ clear_page_mlock(page);
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
@@ -1213,6 +1181,7 @@ void page_remove_rmap(struct page *page)
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
+ return;
out:
if (!anon)
mem_cgroup_end_update_page_stat(page, &locked, &flags);
@@ -1256,7 +1225,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/* Nuke the page table entry. */
flush_cache_page(vma, address, page_to_pfn(page));
- pteval = ptep_clear_flush_notify(vma, address, pte);
+ pteval = ptep_clear_flush(vma, address, pte);
/* Move the dirty bit to the physical page now the pte is gone. */
if (pte_dirty(pteval))
@@ -1318,6 +1287,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
out_unmap:
pte_unmap_unlock(pte, ptl);
+ if (ret != SWAP_FAIL)
+ mmu_notifier_invalidate_page(mm, address);
out:
return ret;
@@ -1382,6 +1353,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
spinlock_t *ptl;
struct page *page;
unsigned long address;
+ unsigned long mmun_start; /* For mmu_notifiers */
+ unsigned long mmun_end; /* For mmu_notifiers */
unsigned long end;
int ret = SWAP_AGAIN;
int locked_vma = 0;
@@ -1405,6 +1378,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
if (!pmd_present(*pmd))
return ret;
+ mmun_start = address;
+ mmun_end = end;
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
/*
* If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
* keep the sem while scanning the cluster for mlocking pages.
@@ -1438,7 +1415,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush_notify(vma, address, pte);
+ pteval = ptep_clear_flush(vma, address, pte);
/* If nonlinear, store the file page offset in the pte. */
if (page->index != linear_page_index(vma, address))
@@ -1454,6 +1431,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
(*mapcount)--;
}
pte_unmap_unlock(pte - 1, ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (locked_vma)
up_read(&vma->vm_mm->mmap_sem);
return ret;
@@ -1492,6 +1470,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma)
static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
{
struct anon_vma *anon_vma;
+ pgoff_t pgoff;
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
@@ -1499,7 +1478,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
if (!anon_vma)
return ret;
- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address;
@@ -1516,8 +1496,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
continue;
address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
ret = try_to_unmap_one(page, vma, address, flags);
if (ret != SWAP_AGAIN || !page_mapped(page))
break;
@@ -1547,7 +1525,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
int ret = SWAP_AGAIN;
unsigned long cursor;
unsigned long max_nl_cursor = 0;
@@ -1555,10 +1532,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
unsigned int mapcount;
mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
ret = try_to_unmap_one(page, vma, address, flags);
if (ret != SWAP_AGAIN || !page_mapped(page))
goto out;
@@ -1576,7 +1551,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
goto out;
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.vm_set.list) {
+ shared.nonlinear) {
cursor = (unsigned long) vma->vm_private_data;
if (cursor > max_nl_cursor)
max_nl_cursor = cursor;
@@ -1608,7 +1583,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
do {
list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
- shared.vm_set.list) {
+ shared.nonlinear) {
cursor = (unsigned long) vma->vm_private_data;
while ( cursor < max_nl_cursor &&
cursor < vma->vm_end - vma->vm_start) {
@@ -1631,7 +1606,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
* in locked vmas). Reset cursor on all unreserved nonlinear
* vmas, now forgetting on which ones it had fallen behind.
*/
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+ list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
vma->vm_private_data = NULL;
out:
mutex_unlock(&mapping->i_mmap_mutex);
@@ -1716,6 +1691,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
struct vm_area_struct *, unsigned long, void *), void *arg)
{
struct anon_vma *anon_vma;
+ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
@@ -1729,11 +1705,9 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
if (!anon_vma)
return ret;
anon_vma_lock(anon_vma);
- list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
ret = rmap_one(page, vma, address, arg);
if (ret != SWAP_AGAIN)
break;
@@ -1748,16 +1722,13 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
- struct prio_tree_iter iter;
int ret = SWAP_AGAIN;
if (!mapping)
return ret;
mutex_lock(&mapping->i_mmap_mutex);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
- if (address == -EFAULT)
- continue;
ret = rmap_one(page, vma, address, arg);
if (ret != SWAP_AGAIN)
break;
diff --git a/mm/shmem.c b/mm/shmem.c
index d4e184e2a38e..67afba5117f2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -77,13 +77,6 @@ static struct vfsmount *shm_mnt;
/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128
-struct shmem_xattr {
- struct list_head list; /* anchored by shmem_inode_info->xattr_list */
- char *name; /* xattr name */
- size_t size;
- char value[0];
-};
-
/*
* shmem_fallocate and shmem_writepage communicate via inode->i_private
* (with i_mutex making sure that it has only one user at a time):
@@ -636,7 +629,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
static void shmem_evict_inode(struct inode *inode)
{
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_xattr *xattr, *nxattr;
if (inode->i_mapping->a_ops == &shmem_aops) {
shmem_unacct_size(info->flags, inode->i_size);
@@ -650,10 +642,7 @@ static void shmem_evict_inode(struct inode *inode)
} else
kfree(info->symlink);
- list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
- kfree(xattr->name);
- kfree(xattr);
- }
+ simple_xattrs_free(&info->xattrs);
BUG_ON(inode->i_blocks);
shmem_free_inode(inode->i_sb);
clear_inode(inode);
@@ -1350,7 +1339,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
@@ -1377,7 +1365,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
spin_lock_init(&info->lock);
info->flags = flags & VM_NORESERVE;
INIT_LIST_HEAD(&info->swaplist);
- INIT_LIST_HEAD(&info->xattr_list);
+ simple_xattrs_init(&info->xattrs);
cache_no_acl(inode);
switch (mode & S_IFMT) {
@@ -2060,28 +2048,6 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
*/
/*
- * Allocate new xattr and copy in the value; but leave the name to callers.
- */
-static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size)
-{
- struct shmem_xattr *new_xattr;
- size_t len;
-
- /* wrap around? */
- len = sizeof(*new_xattr) + size;
- if (len <= sizeof(*new_xattr))
- return NULL;
-
- new_xattr = kmalloc(len, GFP_KERNEL);
- if (!new_xattr)
- return NULL;
-
- new_xattr->size = size;
- memcpy(new_xattr->value, value, size);
- return new_xattr;
-}
-
-/*
* Callback for security_inode_init_security() for acquiring xattrs.
*/
static int shmem_initxattrs(struct inode *inode,
@@ -2090,11 +2056,11 @@ static int shmem_initxattrs(struct inode *inode,
{
struct shmem_inode_info *info = SHMEM_I(inode);
const struct xattr *xattr;
- struct shmem_xattr *new_xattr;
+ struct simple_xattr *new_xattr;
size_t len;
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len);
+ new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len);
if (!new_xattr)
return -ENOMEM;
@@ -2111,91 +2077,12 @@ static int shmem_initxattrs(struct inode *inode,
memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
xattr->name, len);
- spin_lock(&info->lock);
- list_add(&new_xattr->list, &info->xattr_list);
- spin_unlock(&info->lock);
+ simple_xattr_list_add(&info->xattrs, new_xattr);
}
return 0;
}
-static int shmem_xattr_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size)
-{
- struct shmem_inode_info *info;
- struct shmem_xattr *xattr;
- int ret = -ENODATA;
-
- info = SHMEM_I(dentry->d_inode);
-
- spin_lock(&info->lock);
- list_for_each_entry(xattr, &info->xattr_list, list) {
- if (strcmp(name, xattr->name))
- continue;
-
- ret = xattr->size;
- if (buffer) {
- if (size < xattr->size)
- ret = -ERANGE;
- else
- memcpy(buffer, xattr->value, xattr->size);
- }
- break;
- }
- spin_unlock(&info->lock);
- return ret;
-}
-
-static int shmem_xattr_set(struct inode *inode, const char *name,
- const void *value, size_t size, int flags)
-{
- struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_xattr *xattr;
- struct shmem_xattr *new_xattr = NULL;
- int err = 0;
-
- /* value == NULL means remove */
- if (value) {
- new_xattr = shmem_xattr_alloc(value, size);
- if (!new_xattr)
- return -ENOMEM;
-
- new_xattr->name = kstrdup(name, GFP_KERNEL);
- if (!new_xattr->name) {
- kfree(new_xattr);
- return -ENOMEM;
- }
- }
-
- spin_lock(&info->lock);
- list_for_each_entry(xattr, &info->xattr_list, list) {
- if (!strcmp(name, xattr->name)) {
- if (flags & XATTR_CREATE) {
- xattr = new_xattr;
- err = -EEXIST;
- } else if (new_xattr) {
- list_replace(&xattr->list, &new_xattr->list);
- } else {
- list_del(&xattr->list);
- }
- goto out;
- }
- }
- if (flags & XATTR_REPLACE) {
- xattr = new_xattr;
- err = -ENODATA;
- } else {
- list_add(&new_xattr->list, &info->xattr_list);
- xattr = NULL;
- }
-out:
- spin_unlock(&info->lock);
- if (xattr)
- kfree(xattr->name);
- kfree(xattr);
- return err;
-}
-
static const struct xattr_handler *shmem_xattr_handlers[] = {
#ifdef CONFIG_TMPFS_POSIX_ACL
&generic_acl_access_handler,
@@ -2226,6 +2113,7 @@ static int shmem_xattr_validate(const char *name)
static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
+ struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
int err;
/*
@@ -2240,12 +2128,13 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
if (err)
return err;
- return shmem_xattr_get(dentry, name, buffer, size);
+ return simple_xattr_get(&info->xattrs, name, buffer, size);
}
static int shmem_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
+ struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
int err;
/*
@@ -2260,15 +2149,12 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
if (err)
return err;
- if (size == 0)
- value = ""; /* empty EA, do not remove */
-
- return shmem_xattr_set(dentry->d_inode, name, value, size, flags);
-
+ return simple_xattr_set(&info->xattrs, name, value, size, flags);
}
static int shmem_removexattr(struct dentry *dentry, const char *name)
{
+ struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
int err;
/*
@@ -2283,45 +2169,13 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
if (err)
return err;
- return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
-}
-
-static bool xattr_is_trusted(const char *name)
-{
- return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+ return simple_xattr_remove(&info->xattrs, name);
}
static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
- bool trusted = capable(CAP_SYS_ADMIN);
- struct shmem_xattr *xattr;
- struct shmem_inode_info *info;
- size_t used = 0;
-
- info = SHMEM_I(dentry->d_inode);
-
- spin_lock(&info->lock);
- list_for_each_entry(xattr, &info->xattr_list, list) {
- size_t len;
-
- /* skip "trusted." attributes for unprivileged callers */
- if (!trusted && xattr_is_trusted(xattr->name))
- continue;
-
- len = strlen(xattr->name) + 1;
- used += len;
- if (buffer) {
- if (size < used) {
- used = -ERANGE;
- break;
- }
- memcpy(buffer, xattr->name, len);
- buffer += len;
- }
- }
- spin_unlock(&info->lock);
-
- return used;
+ struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
+ return simple_xattr_list(&info->xattrs, buffer, size);
}
#endif /* CONFIG_TMPFS_XATTR */
@@ -2366,12 +2220,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
{
struct inode *inode;
struct dentry *dentry = NULL;
- u64 inum = fid->raw[2];
- inum = (inum << 32) | fid->raw[1];
+ u64 inum;
if (fh_len < 3)
return NULL;
+ inum = fid->raw[2];
+ inum = (inum << 32) | fid->raw[1];
+
inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
shmem_match, fid->raw);
if (inode) {
@@ -2788,6 +2644,7 @@ static const struct vm_operations_struct shmem_vm_ops = {
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
#endif
+ .remap_pages = generic_file_remap_pages,
};
static struct dentry *shmem_mount(struct file_system_type *fs_type,
@@ -2981,7 +2838,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
fput(vma->vm_file);
vma->vm_file = file;
vma->vm_ops = &shmem_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
diff --git a/mm/slab.c b/mm/slab.c
index c6854759bcf1..33d3363658df 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -498,14 +498,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
#endif
-#ifdef CONFIG_TRACING
-size_t slab_buffer_size(struct kmem_cache *cachep)
-{
- return cachep->size;
-}
-EXPORT_SYMBOL(slab_buffer_size);
-#endif
-
/*
* Do not go above this order unless 0 objects fit into the slab or
* overridden on the command line.
@@ -515,13 +507,6 @@ EXPORT_SYMBOL(slab_buffer_size);
static int slab_max_order = SLAB_MAX_ORDER_LO;
static bool slab_max_order_set __initdata;
-static inline struct kmem_cache *page_get_cache(struct page *page)
-{
- page = compound_head(page);
- BUG_ON(!PageSlab(page));
- return page->slab_cache;
-}
-
static inline struct kmem_cache *virt_to_cache(const void *obj)
{
struct page *page = virt_to_head_page(obj);
@@ -585,9 +570,9 @@ static struct arraycache_init initarray_generic =
{ {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
/* internal cache of cache description objs */
-static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
-static struct kmem_cache cache_cache = {
- .nodelists = cache_cache_nodelists,
+static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES];
+static struct kmem_cache kmem_cache_boot = {
+ .nodelists = kmem_cache_nodelists,
.batchcount = 1,
.limit = BOOT_CPUCACHE_ENTRIES,
.shared = 1,
@@ -810,6 +795,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
}
+#if DEBUG
#define slab_error(cachep, msg) __slab_error(__func__, cachep, msg)
static void __slab_error(const char *function, struct kmem_cache *cachep,
@@ -818,7 +804,9 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
function, cachep->name, msg);
dump_stack();
+ add_taint(TAINT_BAD_PAGE);
}
+#endif
/*
* By default on NUMA we use alien caches to stage the freeing of
@@ -900,7 +888,7 @@ static void __cpuinit start_cpu_timer(int cpu)
*/
if (keventd_up() && reap_work->work.func == NULL) {
init_reap_node(cpu);
- INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap);
+ INIT_DEFERRABLE_WORK(reap_work, cache_reap);
schedule_delayed_work_on(cpu, reap_work,
__round_jiffies_relative(HZ, cpu));
}
@@ -1601,15 +1589,17 @@ void __init kmem_cache_init(void)
int order;
int node;
+ kmem_cache = &kmem_cache_boot;
+
if (num_possible_nodes() == 1)
use_alien_caches = 0;
for (i = 0; i < NUM_INIT_LISTS; i++) {
kmem_list3_init(&initkmem_list3[i]);
if (i < MAX_NUMNODES)
- cache_cache.nodelists[i] = NULL;
+ kmem_cache->nodelists[i] = NULL;
}
- set_up_list3s(&cache_cache, CACHE_CACHE);
+ set_up_list3s(kmem_cache, CACHE_CACHE);
/*
* Fragmentation resistance on low memory - only use bigger
@@ -1621,9 +1611,9 @@ void __init kmem_cache_init(void)
/* Bootstrap is tricky, because several objects are allocated
* from caches that do not exist yet:
- * 1) initialize the cache_cache cache: it contains the struct
- * kmem_cache structures of all caches, except cache_cache itself:
- * cache_cache is statically allocated.
+ * 1) initialize the kmem_cache cache: it contains the struct
+ * kmem_cache structures of all caches, except kmem_cache itself:
+ * kmem_cache is statically allocated.
* Initially an __init data area is used for the head array and the
* kmem_list3 structures, it's replaced with a kmalloc allocated
* array at the end of the bootstrap.
@@ -1632,43 +1622,43 @@ void __init kmem_cache_init(void)
* An __init data area is used for the head array.
* 3) Create the remaining kmalloc caches, with minimally sized
* head arrays.
- * 4) Replace the __init data head arrays for cache_cache and the first
+ * 4) Replace the __init data head arrays for kmem_cache and the first
* kmalloc cache with kmalloc allocated arrays.
- * 5) Replace the __init data for kmem_list3 for cache_cache and
+ * 5) Replace the __init data for kmem_list3 for kmem_cache and
* the other cache's with kmalloc allocated memory.
* 6) Resize the head arrays of the kmalloc caches to their final sizes.
*/
node = numa_mem_id();
- /* 1) create the cache_cache */
+ /* 1) create the kmem_cache */
INIT_LIST_HEAD(&slab_caches);
- list_add(&cache_cache.list, &slab_caches);
- cache_cache.colour_off = cache_line_size();
- cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
- cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
+ list_add(&kmem_cache->list, &slab_caches);
+ kmem_cache->colour_off = cache_line_size();
+ kmem_cache->array[smp_processor_id()] = &initarray_cache.cache;
+ kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
/*
* struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
*/
- cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
+ kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
nr_node_ids * sizeof(struct kmem_list3 *);
- cache_cache.object_size = cache_cache.size;
- cache_cache.size = ALIGN(cache_cache.size,
+ kmem_cache->object_size = kmem_cache->size;
+ kmem_cache->size = ALIGN(kmem_cache->object_size,
cache_line_size());
- cache_cache.reciprocal_buffer_size =
- reciprocal_value(cache_cache.size);
+ kmem_cache->reciprocal_buffer_size =
+ reciprocal_value(kmem_cache->size);
for (order = 0; order < MAX_ORDER; order++) {
- cache_estimate(order, cache_cache.size,
- cache_line_size(), 0, &left_over, &cache_cache.num);
- if (cache_cache.num)
+ cache_estimate(order, kmem_cache->size,
+ cache_line_size(), 0, &left_over, &kmem_cache->num);
+ if (kmem_cache->num)
break;
}
- BUG_ON(!cache_cache.num);
- cache_cache.gfporder = order;
- cache_cache.colour = left_over / cache_cache.colour_off;
- cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
+ BUG_ON(!kmem_cache->num);
+ kmem_cache->gfporder = order;
+ kmem_cache->colour = left_over / kmem_cache->colour_off;
+ kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) +
sizeof(struct slab), cache_line_size());
/* 2+3) create the kmalloc caches */
@@ -1681,19 +1671,22 @@ void __init kmem_cache_init(void)
* bug.
*/
- sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name,
- sizes[INDEX_AC].cs_size,
- ARCH_KMALLOC_MINALIGN,
- ARCH_KMALLOC_FLAGS|SLAB_PANIC,
- NULL);
+ sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+ sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name;
+ sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size;
+ sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size;
+ sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
+ __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
+ list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches);
if (INDEX_AC != INDEX_L3) {
- sizes[INDEX_L3].cs_cachep =
- __kmem_cache_create(names[INDEX_L3].name,
- sizes[INDEX_L3].cs_size,
- ARCH_KMALLOC_MINALIGN,
- ARCH_KMALLOC_FLAGS|SLAB_PANIC,
- NULL);
+ sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+ sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name;
+ sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size;
+ sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size;
+ sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN;
+ __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
+ list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches);
}
slab_early_init = 0;
@@ -1707,20 +1700,23 @@ void __init kmem_cache_init(void)
* allow tighter packing of the smaller caches.
*/
if (!sizes->cs_cachep) {
- sizes->cs_cachep = __kmem_cache_create(names->name,
- sizes->cs_size,
- ARCH_KMALLOC_MINALIGN,
- ARCH_KMALLOC_FLAGS|SLAB_PANIC,
- NULL);
+ sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+ sizes->cs_cachep->name = names->name;
+ sizes->cs_cachep->size = sizes->cs_size;
+ sizes->cs_cachep->object_size = sizes->cs_size;
+ sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN;
+ __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC);
+ list_add(&sizes->cs_cachep->list, &slab_caches);
}
#ifdef CONFIG_ZONE_DMA
- sizes->cs_dmacachep = __kmem_cache_create(
- names->name_dma,
- sizes->cs_size,
- ARCH_KMALLOC_MINALIGN,
- ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
- SLAB_PANIC,
- NULL);
+ sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+ sizes->cs_dmacachep->name = names->name_dma;
+ sizes->cs_dmacachep->size = sizes->cs_size;
+ sizes->cs_dmacachep->object_size = sizes->cs_size;
+ sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN;
+ __kmem_cache_create(sizes->cs_dmacachep,
+ ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC);
+ list_add(&sizes->cs_dmacachep->list, &slab_caches);
#endif
sizes++;
names++;
@@ -1731,15 +1727,15 @@ void __init kmem_cache_init(void)
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
- BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
- memcpy(ptr, cpu_cache_get(&cache_cache),
+ BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache);
+ memcpy(ptr, cpu_cache_get(kmem_cache),
sizeof(struct arraycache_init));
/*
* Do not assume that spinlocks can be initialized via memcpy:
*/
spin_lock_init(&ptr->lock);
- cache_cache.array[smp_processor_id()] = ptr;
+ kmem_cache->array[smp_processor_id()] = ptr;
ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
@@ -1760,7 +1756,7 @@ void __init kmem_cache_init(void)
int nid;
for_each_online_node(nid) {
- init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
+ init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid);
init_list(malloc_sizes[INDEX_AC].cs_cachep,
&initkmem_list3[SIZE_AC + nid], nid);
@@ -1781,9 +1777,6 @@ void __init kmem_cache_init_late(void)
slab_state = UP;
- /* Annotate slab for lockdep -- annotate the malloc caches */
- init_lock_keys();
-
/* 6) resize the head arrays to their final sizes */
mutex_lock(&slab_mutex);
list_for_each_entry(cachep, &slab_caches, list)
@@ -1791,6 +1784,9 @@ void __init kmem_cache_init_late(void)
BUG();
mutex_unlock(&slab_mutex);
+ /* Annotate slab for lockdep -- annotate the malloc caches */
+ init_lock_keys();
+
/* Done! */
slab_state = FULL;
@@ -2209,27 +2205,6 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
}
}
-static void __kmem_cache_destroy(struct kmem_cache *cachep)
-{
- int i;
- struct kmem_list3 *l3;
-
- for_each_online_cpu(i)
- kfree(cachep->array[i]);
-
- /* NUMA: free the list3 structures */
- for_each_online_node(i) {
- l3 = cachep->nodelists[i];
- if (l3) {
- kfree(l3->shared);
- free_alien_cache(l3->alien);
- kfree(l3);
- }
- }
- kmem_cache_free(&cache_cache, cachep);
-}
-
-
/**
* calculate_slab_order - calculate size (page order) of slabs
* @cachep: pointer to the cache that is being created
@@ -2366,9 +2341,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
* Cannot be called within a int, but can be interrupted.
* The @ctor is run when new pages are allocated by the cache.
*
- * @name must be valid until the cache is destroyed. This implies that
- * the module calling this has to destroy the cache before getting unloaded.
- *
* The flags are
*
* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
@@ -2381,13 +2353,13 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
*/
-struct kmem_cache *
-__kmem_cache_create (const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *))
+int
+__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
size_t left_over, slab_size, ralign;
- struct kmem_cache *cachep = NULL;
gfp_t gfp;
+ int err;
+ size_t size = cachep->size;
#if DEBUG
#if FORCED_DEBUG
@@ -2459,8 +2431,8 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
ralign = ARCH_SLAB_MINALIGN;
}
/* 3) caller mandated alignment */
- if (ralign < align) {
- ralign = align;
+ if (ralign < cachep->align) {
+ ralign = cachep->align;
}
/* disable debug if necessary */
if (ralign > __alignof__(unsigned long long))
@@ -2468,21 +2440,14 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
/*
* 4) Store it.
*/
- align = ralign;
+ cachep->align = ralign;
if (slab_is_available())
gfp = GFP_KERNEL;
else
gfp = GFP_NOWAIT;
- /* Get cache's description obj. */
- cachep = kmem_cache_zalloc(&cache_cache, gfp);
- if (!cachep)
- return NULL;
-
cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
- cachep->object_size = size;
- cachep->align = align;
#if DEBUG
/*
@@ -2506,8 +2471,9 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
- && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
- cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
+ && cachep->object_size > cache_line_size()
+ && ALIGN(size, cachep->align) < PAGE_SIZE) {
+ cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
size = PAGE_SIZE;
}
#endif
@@ -2527,18 +2493,15 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
*/
flags |= CFLGS_OFF_SLAB;
- size = ALIGN(size, align);
+ size = ALIGN(size, cachep->align);
- left_over = calculate_slab_order(cachep, size, align, flags);
+ left_over = calculate_slab_order(cachep, size, cachep->align, flags);
+
+ if (!cachep->num)
+ return -E2BIG;
- if (!cachep->num) {
- printk(KERN_ERR
- "kmem_cache_create: couldn't create cache %s.\n", name);
- kmem_cache_free(&cache_cache, cachep);
- return NULL;
- }
slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
- + sizeof(struct slab), align);
+ + sizeof(struct slab), cachep->align);
/*
* If the slab has been placed off-slab, and we have enough space then
@@ -2566,8 +2529,8 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
cachep->colour_off = cache_line_size();
/* Offset must be a multiple of the alignment. */
- if (cachep->colour_off < align)
- cachep->colour_off = align;
+ if (cachep->colour_off < cachep->align)
+ cachep->colour_off = cachep->align;
cachep->colour = left_over / cachep->colour_off;
cachep->slab_size = slab_size;
cachep->flags = flags;
@@ -2588,12 +2551,11 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
*/
BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
}
- cachep->ctor = ctor;
- cachep->name = name;
- if (setup_cpu_cache(cachep, gfp)) {
- __kmem_cache_destroy(cachep);
- return NULL;
+ err = setup_cpu_cache(cachep, gfp);
+ if (err) {
+ __kmem_cache_shutdown(cachep);
+ return err;
}
if (flags & SLAB_DEBUG_OBJECTS) {
@@ -2606,9 +2568,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align,
slab_set_debugobj_lock_classes(cachep);
}
- /* cache setup completed, link it into the list */
- list_add(&cachep->list, &slab_caches);
- return cachep;
+ return 0;
}
#if DEBUG
@@ -2767,49 +2727,29 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-/**
- * kmem_cache_destroy - delete a cache
- * @cachep: the cache to destroy
- *
- * Remove a &struct kmem_cache object from the slab cache.
- *
- * It is expected this function will be called by a module when it is
- * unloaded. This will remove the cache completely, and avoid a duplicate
- * cache being allocated each time a module is loaded and unloaded, if the
- * module doesn't have persistent in-kernel storage across loads and unloads.
- *
- * The cache must be empty before calling this function.
- *
- * The caller must guarantee that no one will allocate memory from the cache
- * during the kmem_cache_destroy().
- */
-void kmem_cache_destroy(struct kmem_cache *cachep)
+int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
- BUG_ON(!cachep || in_interrupt());
+ int i;
+ struct kmem_list3 *l3;
+ int rc = __cache_shrink(cachep);
- /* Find the cache in the chain of caches. */
- get_online_cpus();
- mutex_lock(&slab_mutex);
- /*
- * the chain is never empty, cache_cache is never destroyed
- */
- list_del(&cachep->list);
- if (__cache_shrink(cachep)) {
- slab_error(cachep, "Can't free all objects");
- list_add(&cachep->list, &slab_caches);
- mutex_unlock(&slab_mutex);
- put_online_cpus();
- return;
- }
+ if (rc)
+ return rc;
- if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
- rcu_barrier();
+ for_each_online_cpu(i)
+ kfree(cachep->array[i]);
- __kmem_cache_destroy(cachep);
- mutex_unlock(&slab_mutex);
- put_online_cpus();
+ /* NUMA: free the list3 structures */
+ for_each_online_node(i) {
+ l3 = cachep->nodelists[i];
+ if (l3) {
+ kfree(l3->shared);
+ free_alien_cache(l3->alien);
+ kfree(l3);
+ }
+ }
+ return 0;
}
-EXPORT_SYMBOL(kmem_cache_destroy);
/*
* Get the memory for a slab management obj.
@@ -3098,7 +3038,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
}
static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
- void *caller)
+ unsigned long caller)
{
struct page *page;
unsigned int objnr;
@@ -3118,7 +3058,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
*dbg_redzone2(cachep, objp) = RED_INACTIVE;
}
if (cachep->flags & SLAB_STORE_USER)
- *dbg_userword(cachep, objp) = caller;
+ *dbg_userword(cachep, objp) = (void *)caller;
objnr = obj_to_index(cachep, slabp, objp);
@@ -3131,7 +3071,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
- store_stackinfo(cachep, objp, (unsigned long)caller);
+ store_stackinfo(cachep, objp, caller);
kernel_map_pages(virt_to_page(objp),
cachep->size / PAGE_SIZE, 0);
} else {
@@ -3285,7 +3225,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
#if DEBUG
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
- gfp_t flags, void *objp, void *caller)
+ gfp_t flags, void *objp, unsigned long caller)
{
if (!objp)
return objp;
@@ -3302,7 +3242,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
poison_obj(cachep, objp, POISON_INUSE);
}
if (cachep->flags & SLAB_STORE_USER)
- *dbg_userword(cachep, objp) = caller;
+ *dbg_userword(cachep, objp) = (void *)caller;
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
@@ -3343,7 +3283,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
{
- if (cachep == &cache_cache)
+ if (cachep == kmem_cache)
return false;
return should_failslab(cachep->object_size, flags, cachep->flags);
@@ -3576,8 +3516,8 @@ done:
* Fallback to other node is possible if __GFP_THISNODE is not set.
*/
static __always_inline void *
-__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
- void *caller)
+slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
+ unsigned long caller)
{
unsigned long save_flags;
void *ptr;
@@ -3663,7 +3603,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
#endif /* CONFIG_NUMA */
static __always_inline void *
-__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
+slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
unsigned long save_flags;
void *objp;
@@ -3799,7 +3739,7 @@ free_done:
* be in this state _before_ it is released. Called with disabled ints.
*/
static inline void __cache_free(struct kmem_cache *cachep, void *objp,
- void *caller)
+ unsigned long caller)
{
struct array_cache *ac = cpu_cache_get(cachep);
@@ -3839,7 +3779,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
*/
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
- void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
+ void *ret = slab_alloc(cachep, flags, _RET_IP_);
trace_kmem_cache_alloc(_RET_IP_, ret,
cachep->object_size, cachep->size, flags);
@@ -3850,14 +3790,14 @@ EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
void *
-kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
+kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
{
void *ret;
- ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
+ ret = slab_alloc(cachep, flags, _RET_IP_);
trace_kmalloc(_RET_IP_, ret,
- size, slab_buffer_size(cachep), flags);
+ size, cachep->size, flags);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -3866,8 +3806,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
- void *ret = __cache_alloc_node(cachep, flags, nodeid,
- __builtin_return_address(0));
+ void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
cachep->object_size, cachep->size,
@@ -3878,17 +3817,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
EXPORT_SYMBOL(kmem_cache_alloc_node);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_trace(size_t size,
- struct kmem_cache *cachep,
+void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
gfp_t flags,
- int nodeid)
+ int nodeid,
+ size_t size)
{
void *ret;
- ret = __cache_alloc_node(cachep, flags, nodeid,
- __builtin_return_address(0));
+ ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+
trace_kmalloc_node(_RET_IP_, ret,
- size, slab_buffer_size(cachep),
+ size, cachep->size,
flags, nodeid);
return ret;
}
@@ -3896,34 +3835,33 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
static __always_inline void *
-__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
+__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
{
struct kmem_cache *cachep;
cachep = kmem_find_general_cachep(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- return kmem_cache_alloc_node_trace(size, cachep, flags, node);
+ return kmem_cache_alloc_node_trace(cachep, flags, node, size);
}
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
- return __do_kmalloc_node(size, flags, node,
- __builtin_return_address(0));
+ return __do_kmalloc_node(size, flags, node, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc_node);
void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
int node, unsigned long caller)
{
- return __do_kmalloc_node(size, flags, node, (void *)caller);
+ return __do_kmalloc_node(size, flags, node, caller);
}
EXPORT_SYMBOL(__kmalloc_node_track_caller);
#else
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
- return __do_kmalloc_node(size, flags, node, NULL);
+ return __do_kmalloc_node(size, flags, node, 0);
}
EXPORT_SYMBOL(__kmalloc_node);
#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
@@ -3936,7 +3874,7 @@ EXPORT_SYMBOL(__kmalloc_node);
* @caller: function caller for debug tracking of the caller
*/
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
- void *caller)
+ unsigned long caller)
{
struct kmem_cache *cachep;
void *ret;
@@ -3949,9 +3887,9 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
cachep = __find_general_cachep(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- ret = __cache_alloc(cachep, flags, caller);
+ ret = slab_alloc(cachep, flags, caller);
- trace_kmalloc((unsigned long) caller, ret,
+ trace_kmalloc(caller, ret,
size, cachep->size, flags);
return ret;
@@ -3961,20 +3899,20 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
void *__kmalloc(size_t size, gfp_t flags)
{
- return __do_kmalloc(size, flags, __builtin_return_address(0));
+ return __do_kmalloc(size, flags, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc);
void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
{
- return __do_kmalloc(size, flags, (void *)caller);
+ return __do_kmalloc(size, flags, caller);
}
EXPORT_SYMBOL(__kmalloc_track_caller);
#else
void *__kmalloc(size_t size, gfp_t flags)
{
- return __do_kmalloc(size, flags, NULL);
+ return __do_kmalloc(size, flags, 0);
}
EXPORT_SYMBOL(__kmalloc);
#endif
@@ -3995,7 +3933,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
debug_check_no_locks_freed(objp, cachep->object_size);
if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(objp, cachep->object_size);
- __cache_free(cachep, objp, __builtin_return_address(0));
+ __cache_free(cachep, objp, _RET_IP_);
local_irq_restore(flags);
trace_kmem_cache_free(_RET_IP_, objp);
@@ -4026,7 +3964,7 @@ void kfree(const void *objp)
debug_check_no_locks_freed(objp, c->object_size);
debug_check_no_obj_freed(objp, c->object_size);
- __cache_free(c, (void *)objp, __builtin_return_address(0));
+ __cache_free(c, (void *)objp, _RET_IP_);
local_irq_restore(flags);
}
EXPORT_SYMBOL(kfree);
diff --git a/mm/slab.h b/mm/slab.h
index db7848caaa25..7deeb449a301 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -25,9 +25,26 @@ extern enum slab_state slab_state;
/* The slab cache mutex protects the management structures during changes */
extern struct mutex slab_mutex;
+
+/* The list of all slab caches on the system */
extern struct list_head slab_caches;
-struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
+/* The slab cache that manages slab cache information */
+extern struct kmem_cache *kmem_cache;
+
+/* Functions provided by the slab allocators */
+extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
+
+#ifdef CONFIG_SLUB
+struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *));
+#else
+static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *))
+{ return NULL; }
+#endif
+
+
+int __kmem_cache_shutdown(struct kmem_cache *);
#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index aa3ca5bb01b5..069a24e64403 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -22,6 +22,53 @@
enum slab_state slab_state;
LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
+struct kmem_cache *kmem_cache;
+
+#ifdef CONFIG_DEBUG_VM
+static int kmem_cache_sanity_check(const char *name, size_t size)
+{
+ struct kmem_cache *s = NULL;
+
+ if (!name || in_interrupt() || size < sizeof(void *) ||
+ size > KMALLOC_MAX_SIZE) {
+ pr_err("kmem_cache_create(%s) integrity check failed\n", name);
+ return -EINVAL;
+ }
+
+ list_for_each_entry(s, &slab_caches, list) {
+ char tmp;
+ int res;
+
+ /*
+ * This happens when the module gets unloaded and doesn't
+ * destroy its slab cache and no-one else reuses the vmalloc
+ * area of the module. Print a warning.
+ */
+ res = probe_kernel_address(s->name, tmp);
+ if (res) {
+ pr_err("Slab cache with size %d has lost its name\n",
+ s->object_size);
+ continue;
+ }
+
+ if (!strcmp(s->name, name)) {
+ pr_err("%s (%s): Cache name already exists.\n",
+ __func__, name);
+ dump_stack();
+ s = NULL;
+ return -EINVAL;
+ }
+ }
+
+ WARN_ON(strchr(name, ' ')); /* It confuses parsers */
+ return 0;
+}
+#else
+static inline int kmem_cache_sanity_check(const char *name, size_t size)
+{
+ return 0;
+}
+#endif
/*
* kmem_cache_create - Create a cache.
@@ -52,68 +99,95 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align
unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s = NULL;
-
-#ifdef CONFIG_DEBUG_VM
- if (!name || in_interrupt() || size < sizeof(void *) ||
- size > KMALLOC_MAX_SIZE) {
- printk(KERN_ERR "kmem_cache_create(%s) integrity check"
- " failed\n", name);
- goto out;
- }
-#endif
+ int err = 0;
get_online_cpus();
mutex_lock(&slab_mutex);
-#ifdef CONFIG_DEBUG_VM
- list_for_each_entry(s, &slab_caches, list) {
- char tmp;
- int res;
+ if (!kmem_cache_sanity_check(name, size) == 0)
+ goto out_locked;
- /*
- * This happens when the module gets unloaded and doesn't
- * destroy its slab cache and no-one else reuses the vmalloc
- * area of the module. Print a warning.
- */
- res = probe_kernel_address(s->name, tmp);
- if (res) {
- printk(KERN_ERR
- "Slab cache with size %d has lost its name\n",
- s->object_size);
- continue;
- }
- if (!strcmp(s->name, name)) {
- printk(KERN_ERR "kmem_cache_create(%s): Cache name"
- " already exists.\n",
- name);
- dump_stack();
- s = NULL;
- goto oops;
+ s = __kmem_cache_alias(name, size, align, flags, ctor);
+ if (s)
+ goto out_locked;
+
+ s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
+ if (s) {
+ s->object_size = s->size = size;
+ s->align = align;
+ s->ctor = ctor;
+ s->name = kstrdup(name, GFP_KERNEL);
+ if (!s->name) {
+ kmem_cache_free(kmem_cache, s);
+ err = -ENOMEM;
+ goto out_locked;
}
- }
- WARN_ON(strchr(name, ' ')); /* It confuses parsers */
-#endif
+ err = __kmem_cache_create(s, flags);
+ if (!err) {
- s = __kmem_cache_create(name, size, align, flags, ctor);
+ s->refcount = 1;
+ list_add(&s->list, &slab_caches);
-#ifdef CONFIG_DEBUG_VM
-oops:
-#endif
+ } else {
+ kfree(s->name);
+ kmem_cache_free(kmem_cache, s);
+ }
+ } else
+ err = -ENOMEM;
+
+out_locked:
mutex_unlock(&slab_mutex);
put_online_cpus();
-#ifdef CONFIG_DEBUG_VM
-out:
-#endif
- if (!s && (flags & SLAB_PANIC))
- panic("kmem_cache_create: Failed to create slab '%s'\n", name);
+ if (err) {
+
+ if (flags & SLAB_PANIC)
+ panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
+ name, err);
+ else {
+ printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
+ name, err);
+ dump_stack();
+ }
+
+ return NULL;
+ }
return s;
}
EXPORT_SYMBOL(kmem_cache_create);
+void kmem_cache_destroy(struct kmem_cache *s)
+{
+ get_online_cpus();
+ mutex_lock(&slab_mutex);
+ s->refcount--;
+ if (!s->refcount) {
+ list_del(&s->list);
+
+ if (!__kmem_cache_shutdown(s)) {
+ mutex_unlock(&slab_mutex);
+ if (s->flags & SLAB_DESTROY_BY_RCU)
+ rcu_barrier();
+
+ kfree(s->name);
+ kmem_cache_free(kmem_cache, s);
+ } else {
+ list_add(&s->list, &slab_caches);
+ mutex_unlock(&slab_mutex);
+ printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
+ s->name);
+ dump_stack();
+ }
+ } else {
+ mutex_unlock(&slab_mutex);
+ }
+ put_online_cpus();
+}
+EXPORT_SYMBOL(kmem_cache_destroy);
+
int slab_is_available(void)
{
return slab_state >= UP;
diff --git a/mm/slob.c b/mm/slob.c
index 45d4ca79933a..1e921c5e9576 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -194,7 +194,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
void *page;
#ifdef CONFIG_NUMA
- if (node != -1)
+ if (node != NUMA_NO_NODE)
page = alloc_pages_exact_node(node, gfp, order);
else
#endif
@@ -290,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
* If there's a node specification, search for a partial
* page with a matching node id in the freelist.
*/
- if (node != -1 && page_to_nid(sp) != node)
+ if (node != NUMA_NO_NODE && page_to_nid(sp) != node)
continue;
#endif
/* Enough room on this page? */
@@ -425,10 +425,11 @@ out:
* End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
*/
-void *__kmalloc_node(size_t size, gfp_t gfp, int node)
+static __always_inline void *
+__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
{
unsigned int *m;
- int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
void *ret;
gfp &= gfp_allowed_mask;
@@ -446,7 +447,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
*m = size;
ret = (void *)m + align;
- trace_kmalloc_node(_RET_IP_, ret,
+ trace_kmalloc_node(caller, ret,
size, size + align, gfp, node);
} else {
unsigned int order = get_order(size);
@@ -460,15 +461,35 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
page->private = size;
}
- trace_kmalloc_node(_RET_IP_, ret,
+ trace_kmalloc_node(caller, ret,
size, PAGE_SIZE << order, gfp, node);
}
kmemleak_alloc(ret, size, 1, gfp);
return ret;
}
+
+void *__kmalloc_node(size_t size, gfp_t gfp, int node)
+{
+ return __do_kmalloc_node(size, gfp, node, _RET_IP_);
+}
EXPORT_SYMBOL(__kmalloc_node);
+#ifdef CONFIG_TRACING
+void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
+{
+ return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
+}
+
+#ifdef CONFIG_NUMA
+void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
+ int node, unsigned long caller)
+{
+ return __do_kmalloc_node(size, gfp, node, caller);
+}
+#endif
+#endif
+
void kfree(const void *block)
{
struct page *sp;
@@ -481,7 +502,7 @@ void kfree(const void *block)
sp = virt_to_page(block);
if (PageSlab(sp)) {
- int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
unsigned int *m = (unsigned int *)(block - align);
slob_free(m, *m + align);
} else
@@ -500,7 +521,7 @@ size_t ksize(const void *block)
sp = virt_to_page(block);
if (PageSlab(sp)) {
- int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
unsigned int *m = (unsigned int *)(block - align);
return SLOB_UNITS(*m) * SLOB_UNIT;
} else
@@ -508,44 +529,24 @@ size_t ksize(const void *block)
}
EXPORT_SYMBOL(ksize);
-struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *))
+int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
{
- struct kmem_cache *c;
-
- c = slob_alloc(sizeof(struct kmem_cache),
- GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1);
+ size_t align = c->size;
- if (c) {
- c->name = name;
- c->size = size;
- if (flags & SLAB_DESTROY_BY_RCU) {
- /* leave room for rcu footer at the end of object */
- c->size += sizeof(struct slob_rcu);
- }
- c->flags = flags;
- c->ctor = ctor;
- /* ignore alignment unless it's forced */
- c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
- if (c->align < ARCH_SLAB_MINALIGN)
- c->align = ARCH_SLAB_MINALIGN;
- if (c->align < align)
- c->align = align;
-
- kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
- c->refcount = 1;
+ if (flags & SLAB_DESTROY_BY_RCU) {
+ /* leave room for rcu footer at the end of object */
+ c->size += sizeof(struct slob_rcu);
}
- return c;
-}
+ c->flags = flags;
+ /* ignore alignment unless it's forced */
+ c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
+ if (c->align < ARCH_SLAB_MINALIGN)
+ c->align = ARCH_SLAB_MINALIGN;
+ if (c->align < align)
+ c->align = align;
-void kmem_cache_destroy(struct kmem_cache *c)
-{
- kmemleak_free(c);
- if (c->flags & SLAB_DESTROY_BY_RCU)
- rcu_barrier();
- slob_free(c, sizeof(struct kmem_cache));
+ return 0;
}
-EXPORT_SYMBOL(kmem_cache_destroy);
void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
{
@@ -613,14 +614,28 @@ unsigned int kmem_cache_size(struct kmem_cache *c)
}
EXPORT_SYMBOL(kmem_cache_size);
+int __kmem_cache_shutdown(struct kmem_cache *c)
+{
+ /* No way to check for remaining objects */
+ return 0;
+}
+
int kmem_cache_shrink(struct kmem_cache *d)
{
return 0;
}
EXPORT_SYMBOL(kmem_cache_shrink);
+struct kmem_cache kmem_cache_boot = {
+ .name = "kmem_cache",
+ .size = sizeof(struct kmem_cache),
+ .flags = SLAB_PANIC,
+ .align = ARCH_KMALLOC_MINALIGN,
+};
+
void __init kmem_cache_init(void)
{
+ kmem_cache = &kmem_cache_boot;
slab_state = UP;
}
diff --git a/mm/slub.c b/mm/slub.c
index 2fdd96f9e998..a0d698467f70 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -210,11 +210,7 @@ static void sysfs_slab_remove(struct kmem_cache *);
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
-static inline void sysfs_slab_remove(struct kmem_cache *s)
-{
- kfree(s->name);
- kfree(s);
-}
+static inline void sysfs_slab_remove(struct kmem_cache *s) { }
#endif
@@ -568,6 +564,8 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
printk(KERN_ERR "----------------------------------------"
"-------------------------------------\n\n");
+
+ add_taint(TAINT_BAD_PAGE);
}
static void slab_fix(struct kmem_cache *s, char *fmt, ...)
@@ -624,7 +622,7 @@ static void object_err(struct kmem_cache *s, struct page *page,
print_trailer(s, page, object);
}
-static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
+static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...)
{
va_list args;
char buf[100];
@@ -1069,13 +1067,13 @@ bad:
return 0;
}
-static noinline int free_debug_processing(struct kmem_cache *s,
- struct page *page, void *object, unsigned long addr)
+static noinline struct kmem_cache_node *free_debug_processing(
+ struct kmem_cache *s, struct page *page, void *object,
+ unsigned long addr, unsigned long *flags)
{
- unsigned long flags;
- int rc = 0;
+ struct kmem_cache_node *n = get_node(s, page_to_nid(page));
- local_irq_save(flags);
+ spin_lock_irqsave(&n->list_lock, *flags);
slab_lock(page);
if (!check_slab(s, page))
@@ -1113,15 +1111,19 @@ static noinline int free_debug_processing(struct kmem_cache *s,
set_track(s, object, TRACK_FREE, addr);
trace(s, page, object, 0);
init_object(s, object, SLUB_RED_INACTIVE);
- rc = 1;
out:
slab_unlock(page);
- local_irq_restore(flags);
- return rc;
+ /*
+ * Keep node_lock to preserve integrity
+ * until the object is actually freed
+ */
+ return n;
fail:
+ slab_unlock(page);
+ spin_unlock_irqrestore(&n->list_lock, *flags);
slab_fix(s, "Object at 0x%p not freed", object);
- goto out;
+ return NULL;
}
static int __init setup_slub_debug(char *str)
@@ -1214,8 +1216,9 @@ static inline void setup_object_debug(struct kmem_cache *s,
static inline int alloc_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr) { return 0; }
-static inline int free_debug_processing(struct kmem_cache *s,
- struct page *page, void *object, unsigned long addr) { return 0; }
+static inline struct kmem_cache_node *free_debug_processing(
+ struct kmem_cache *s, struct page *page, void *object,
+ unsigned long addr, unsigned long *flags) { return NULL; }
static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
{ return 1; }
@@ -1714,7 +1717,7 @@ static inline void note_cmpxchg_failure(const char *n,
stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
}
-void init_kmem_cache_cpus(struct kmem_cache *s)
+static void init_kmem_cache_cpus(struct kmem_cache *s)
{
int cpu;
@@ -1939,7 +1942,7 @@ static void unfreeze_partials(struct kmem_cache *s)
* If we did not find a slot then simply move all the partials to the
* per node partial list.
*/
-int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
+static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
{
struct page *oldpage;
int pages;
@@ -1962,6 +1965,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
local_irq_save(flags);
unfreeze_partials(s);
local_irq_restore(flags);
+ oldpage = NULL;
pobjects = 0;
pages = 0;
stat(s, CPU_PARTIAL_DRAIN);
@@ -2310,7 +2314,7 @@ new_slab:
*
* Otherwise we can simply pick the next object from the lockless free list.
*/
-static __always_inline void *slab_alloc(struct kmem_cache *s,
+static __always_inline void *slab_alloc_node(struct kmem_cache *s,
gfp_t gfpflags, int node, unsigned long addr)
{
void **object;
@@ -2380,9 +2384,15 @@ redo:
return object;
}
+static __always_inline void *slab_alloc(struct kmem_cache *s,
+ gfp_t gfpflags, unsigned long addr)
+{
+ return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
+}
+
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
- void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, _RET_IP_);
trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags);
@@ -2393,7 +2403,7 @@ EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
- void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, _RET_IP_);
trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
return ret;
}
@@ -2411,7 +2421,7 @@ EXPORT_SYMBOL(kmalloc_order_trace);
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
- void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
+ void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
s->object_size, s->size, gfpflags, node);
@@ -2425,7 +2435,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
int node, size_t size)
{
- void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
+ void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
trace_kmalloc_node(_RET_IP_, ret,
size, s->size, gfpflags, node);
@@ -2457,7 +2467,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
stat(s, FREE_SLOWPATH);
- if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
+ if (kmem_cache_debug(s) &&
+ !(n = free_debug_processing(s, page, x, addr, &flags)))
return;
do {
@@ -2612,6 +2623,13 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
page = virt_to_head_page(x);
+ if (kmem_cache_debug(s) && page->slab != s) {
+ pr_err("kmem_cache_free: Wrong slab cache. %s but object"
+ " is from %s\n", page->slab->name, s->name);
+ WARN_ON_ONCE(1);
+ return;
+ }
+
slab_free(s, page, x, _RET_IP_);
trace_kmem_cache_free(_RET_IP_, x);
@@ -3026,17 +3044,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
}
-static int kmem_cache_open(struct kmem_cache *s,
- const char *name, size_t size,
- size_t align, unsigned long flags,
- void (*ctor)(void *))
+static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
{
- memset(s, 0, kmem_size);
- s->name = name;
- s->ctor = ctor;
- s->object_size = size;
- s->align = align;
- s->flags = kmem_cache_flags(size, flags, name, ctor);
+ s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
s->reserved = 0;
if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
@@ -3098,7 +3108,6 @@ static int kmem_cache_open(struct kmem_cache *s,
else
s->cpu_partial = 30;
- s->refcount = 1;
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
#endif
@@ -3106,16 +3115,16 @@ static int kmem_cache_open(struct kmem_cache *s,
goto error;
if (alloc_kmem_cache_cpus(s))
- return 1;
+ return 0;
free_kmem_cache_nodes(s);
error:
if (flags & SLAB_PANIC)
panic("Cannot create slab %s size=%lu realsize=%u "
"order=%u offset=%u flags=%lx\n",
- s->name, (unsigned long)size, s->size, oo_order(s->oo),
+ s->name, (unsigned long)s->size, s->size, oo_order(s->oo),
s->offset, flags);
- return 0;
+ return -EINVAL;
}
/*
@@ -3137,7 +3146,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
sizeof(long), GFP_ATOMIC);
if (!map)
return;
- slab_err(s, page, "%s", text);
+ slab_err(s, page, text, s->name);
slab_lock(page);
get_map(s, page, map);
@@ -3169,7 +3178,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
discard_slab(s, page);
} else {
list_slab_objects(s, page,
- "Objects remaining on kmem_cache_close()");
+ "Objects remaining in %s on kmem_cache_close()");
}
}
}
@@ -3182,7 +3191,6 @@ static inline int kmem_cache_close(struct kmem_cache *s)
int node;
flush_all(s);
- free_percpu(s->cpu_slab);
/* Attempt to free all objects */
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
@@ -3191,33 +3199,20 @@ static inline int kmem_cache_close(struct kmem_cache *s)
if (n->nr_partial || slabs_node(s, node))
return 1;
}
+ free_percpu(s->cpu_slab);
free_kmem_cache_nodes(s);
return 0;
}
-/*
- * Close a cache and release the kmem_cache structure
- * (must be used for caches created using kmem_cache_create)
- */
-void kmem_cache_destroy(struct kmem_cache *s)
+int __kmem_cache_shutdown(struct kmem_cache *s)
{
- mutex_lock(&slab_mutex);
- s->refcount--;
- if (!s->refcount) {
- list_del(&s->list);
- mutex_unlock(&slab_mutex);
- if (kmem_cache_close(s)) {
- printk(KERN_ERR "SLUB %s: %s called for cache that "
- "still has objects.\n", s->name, __func__);
- dump_stack();
- }
- if (s->flags & SLAB_DESTROY_BY_RCU)
- rcu_barrier();
+ int rc = kmem_cache_close(s);
+
+ if (!rc)
sysfs_slab_remove(s);
- } else
- mutex_unlock(&slab_mutex);
+
+ return rc;
}
-EXPORT_SYMBOL(kmem_cache_destroy);
/********************************************************************
* Kmalloc subsystem
@@ -3226,8 +3221,6 @@ EXPORT_SYMBOL(kmem_cache_destroy);
struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
EXPORT_SYMBOL(kmalloc_caches);
-static struct kmem_cache *kmem_cache;
-
#ifdef CONFIG_ZONE_DMA
static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
#endif
@@ -3273,14 +3266,17 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name,
{
struct kmem_cache *s;
- s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
+ s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
+
+ s->name = name;
+ s->size = s->object_size = size;
+ s->align = ARCH_KMALLOC_MINALIGN;
/*
* This function is called with IRQs disabled during early-boot on
* single CPU so there's no need to take slab_mutex here.
*/
- if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
- flags, NULL))
+ if (kmem_cache_open(s, flags))
goto panic;
list_add(&s->list, &slab_caches);
@@ -3362,7 +3358,7 @@ void *__kmalloc(size_t size, gfp_t flags)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_);
+ ret = slab_alloc(s, flags, _RET_IP_);
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
@@ -3405,7 +3401,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, flags, node, _RET_IP_);
+ ret = slab_alloc_node(s, flags, node, _RET_IP_);
trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
@@ -3482,7 +3478,7 @@ void kfree(const void *x)
if (unlikely(!PageSlab(page))) {
BUG_ON(!PageCompound(page));
kmemleak_free(x);
- put_page(page);
+ __free_pages(page, compound_order(page));
return;
}
slab_free(page->slab, page, object, _RET_IP_);
@@ -3719,12 +3715,12 @@ void __init kmem_cache_init(void)
slub_max_order = 0;
kmem_size = offsetof(struct kmem_cache, node) +
- nr_node_ids * sizeof(struct kmem_cache_node *);
+ nr_node_ids * sizeof(struct kmem_cache_node *);
/* Allocate two kmem_caches from the page allocator */
kmalloc_size = ALIGN(kmem_size, cache_line_size());
order = get_order(2 * kmalloc_size);
- kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
+ kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order);
/*
* Must first have the slab cache available for the allocations of the
@@ -3733,9 +3729,10 @@ void __init kmem_cache_init(void)
*/
kmem_cache_node = (void *)kmem_cache + kmalloc_size;
- kmem_cache_open(kmem_cache_node, "kmem_cache_node",
- sizeof(struct kmem_cache_node),
- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ kmem_cache_node->name = "kmem_cache_node";
+ kmem_cache_node->size = kmem_cache_node->object_size =
+ sizeof(struct kmem_cache_node);
+ kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
@@ -3743,8 +3740,10 @@ void __init kmem_cache_init(void)
slab_state = PARTIAL;
temp_kmem_cache = kmem_cache;
- kmem_cache_open(kmem_cache, "kmem_cache", kmem_size,
- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ kmem_cache->name = "kmem_cache";
+ kmem_cache->size = kmem_cache->object_size = kmem_size;
+ kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+
kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
memcpy(kmem_cache, temp_kmem_cache, kmem_size);
@@ -3933,11 +3932,10 @@ static struct kmem_cache *find_mergeable(size_t size,
return NULL;
}
-struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
+struct kmem_cache *__kmem_cache_alias(const char *name, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
- char *n;
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
@@ -3951,36 +3949,29 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
if (sysfs_slab_alias(s, name)) {
s->refcount--;
- return NULL;
+ s = NULL;
}
- return s;
}
- n = kstrdup(name, GFP_KERNEL);
- if (!n)
- return NULL;
+ return s;
+}
- s = kmalloc(kmem_size, GFP_KERNEL);
- if (s) {
- if (kmem_cache_open(s, n,
- size, align, flags, ctor)) {
- int r;
+int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
+{
+ int err;
- list_add(&s->list, &slab_caches);
- mutex_unlock(&slab_mutex);
- r = sysfs_slab_add(s);
- mutex_lock(&slab_mutex);
+ err = kmem_cache_open(s, flags);
+ if (err)
+ return err;
- if (!r)
- return s;
+ mutex_unlock(&slab_mutex);
+ err = sysfs_slab_add(s);
+ mutex_lock(&slab_mutex);
- list_del(&s->list);
- kmem_cache_close(s);
- }
- kfree(s);
- }
- kfree(n);
- return NULL;
+ if (err)
+ kmem_cache_close(s);
+
+ return err;
}
#ifdef CONFIG_SMP
@@ -4033,7 +4024,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
+ ret = slab_alloc(s, gfpflags, caller);
/* Honor the call site pointer we received. */
trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -4063,7 +4054,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, gfpflags, node, caller);
+ ret = slab_alloc_node(s, gfpflags, node, caller);
/* Honor the call site pointer we received. */
trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
@@ -5210,14 +5201,6 @@ static ssize_t slab_attr_store(struct kobject *kobj,
return err;
}
-static void kmem_cache_release(struct kobject *kobj)
-{
- struct kmem_cache *s = to_slab(kobj);
-
- kfree(s->name);
- kfree(s);
-}
-
static const struct sysfs_ops slab_sysfs_ops = {
.show = slab_attr_show,
.store = slab_attr_store,
@@ -5225,7 +5208,6 @@ static const struct sysfs_ops slab_sysfs_ops = {
static struct kobj_type slab_ktype = {
.sysfs_ops = &slab_sysfs_ops,
- .release = kmem_cache_release
};
static int uevent_filter(struct kset *kset, struct kobject *kobj)
diff --git a/mm/swap.c b/mm/swap.c
index 77825883298f..6310dc2008ff 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page)
}
EXPORT_SYMBOL(mark_page_accessed);
+/*
+ * Order of operations is important: flush the pagevec when it's already
+ * full, not when adding the last page, to make sure that last page is
+ * not added to the LRU directly when passed to this function. Because
+ * mark_page_accessed() (called after this when writing) only activates
+ * pages that are on the LRU, linear writes in subpage chunks would see
+ * every PAGEVEC_SIZE page activated, which is unexpected.
+ */
void __lru_cache_add(struct page *page, enum lru_list lru)
{
struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
page_cache_get(page);
- if (!pagevec_add(pvec, page))
+ if (!pagevec_space(pvec))
__pagevec_lru_add(pvec, lru);
+ pagevec_add(pvec, page);
put_cpu_var(lru_add_pvecs);
}
EXPORT_SYMBOL(__lru_cache_add);
@@ -742,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
SetPageLRU(page_tail);
- if (page_evictable(page_tail, NULL)) {
+ if (page_evictable(page_tail)) {
if (PageActive(page)) {
SetPageActive(page_tail);
active = 1;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 14e254c768fc..71cd288b2001 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1483,7 +1483,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
struct file *swap_file, *victim;
struct address_space *mapping;
struct inode *inode;
- char *pathname;
+ struct filename *pathname;
int oom_score_adj;
int i, type, prev;
int err;
@@ -1498,8 +1498,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (IS_ERR(pathname))
goto out;
- victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0);
- putname(pathname);
+ victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
err = PTR_ERR(victim);
if (IS_ERR(victim))
goto out;
@@ -1936,7 +1935,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
struct swap_info_struct *p;
- char *name;
+ struct filename *name;
struct file *swap_file = NULL;
struct address_space *mapping;
int i;
@@ -1967,7 +1966,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
name = NULL;
goto bad_swap;
}
- swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
+ swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
if (IS_ERR(swap_file)) {
error = PTR_ERR(swap_file);
swap_file = NULL;
@@ -2053,7 +2052,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
printk(KERN_INFO "Adding %uk swap on %s. "
"Priority:%d extents:%d across:%lluk %s%s%s\n",
- p->pages<<(PAGE_SHIFT-10), name, p->prio,
+ p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
(p->flags & SWP_DISCARDABLE) ? "D" : "",
diff --git a/mm/truncate.c b/mm/truncate.c
index 75801acdaac7..d51ce92d6e83 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
cancel_dirty_page(page, PAGE_CACHE_SIZE);
- clear_page_mlock(page);
ClearPageMappedToDisk(page);
delete_from_page_cache(page);
return 0;
@@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
if (page_has_private(page) && !try_to_release_page(page, 0))
return 0;
- clear_page_mlock(page);
ret = remove_mapping(mapping, page);
return ret;
@@ -398,7 +396,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
if (PageDirty(page))
goto failed;
- clear_page_mlock(page);
BUG_ON(page_has_private(page));
__delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/util.c b/mm/util.c
index 3a5278c08d76..c55e26b17d93 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -105,6 +105,25 @@ void *memdup_user(const void __user *src, size_t len)
}
EXPORT_SYMBOL(memdup_user);
+static __always_inline void *__do_krealloc(const void *p, size_t new_size,
+ gfp_t flags)
+{
+ void *ret;
+ size_t ks = 0;
+
+ if (p)
+ ks = ksize(p);
+
+ if (ks >= new_size)
+ return (void *)p;
+
+ ret = kmalloc_track_caller(new_size, flags);
+ if (ret && p)
+ memcpy(ret, p, ks);
+
+ return ret;
+}
+
/**
* __krealloc - like krealloc() but don't free @p.
* @p: object to reallocate memory for.
@@ -117,23 +136,11 @@ EXPORT_SYMBOL(memdup_user);
*/
void *__krealloc(const void *p, size_t new_size, gfp_t flags)
{
- void *ret;
- size_t ks = 0;
-
if (unlikely(!new_size))
return ZERO_SIZE_PTR;
- if (p)
- ks = ksize(p);
+ return __do_krealloc(p, new_size, flags);
- if (ks >= new_size)
- return (void *)p;
-
- ret = kmalloc_track_caller(new_size, flags);
- if (ret && p)
- memcpy(ret, p, ks);
-
- return ret;
}
EXPORT_SYMBOL(__krealloc);
@@ -157,7 +164,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
return ZERO_SIZE_PTR;
}
- ret = __krealloc(p, new_size, flags);
+ ret = __do_krealloc(p, new_size, flags);
if (ret && p != ret)
kfree(p);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2bb90b1d241c..78e08300db21 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
usize -= PAGE_SIZE;
} while (usize > 0);
- /* Prevent "things" like memory migration? VM_flags need a cleanup... */
- vma->vm_flags |= VM_RESERVED;
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
return 0;
}
@@ -2572,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p)
{
struct vm_struct *v = p;
- seq_printf(m, "0x%p-0x%p %7ld",
+ seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
if (v->caller)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 99b434b674c0..2624edcfb420 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -553,7 +553,7 @@ void putback_lru_page(struct page *page)
redo:
ClearPageUnevictable(page);
- if (page_evictable(page, NULL)) {
+ if (page_evictable(page)) {
/*
* For evictable pages, we can use the cache.
* In event of a race, worst case is we end up with an
@@ -587,7 +587,7 @@ redo:
* page is on unevictable list, it never be freed. To avoid that,
* check after we added it to the list, again.
*/
- if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
+ if (lru == LRU_UNEVICTABLE && page_evictable(page)) {
if (!isolate_lru_page(page)) {
put_page(page);
goto redo;
@@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page,
static unsigned long shrink_page_list(struct list_head *page_list,
struct zone *zone,
struct scan_control *sc,
+ enum ttu_flags ttu_flags,
unsigned long *ret_nr_dirty,
- unsigned long *ret_nr_writeback)
+ unsigned long *ret_nr_writeback,
+ bool force_reclaim)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
@@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
mem_cgroup_uncharge_start();
while (!list_empty(page_list)) {
- enum page_references references;
struct address_space *mapping;
struct page *page;
int may_enter_fs;
+ enum page_references references = PAGEREF_RECLAIM_CLEAN;
cond_resched();
@@ -707,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
sc->nr_scanned++;
- if (unlikely(!page_evictable(page, NULL)))
+ if (unlikely(!page_evictable(page)))
goto cull_mlocked;
if (!sc->may_unmap && page_mapped(page))
@@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
wait_on_page_writeback(page);
}
- references = page_check_references(page, sc);
+ if (!force_reclaim)
+ references = page_check_references(page, sc);
+
switch (references) {
case PAGEREF_ACTIVATE:
goto activate_locked;
@@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page, TTU_UNMAP)) {
+ switch (try_to_unmap(page, ttu_flags)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
@@ -960,6 +964,33 @@ keep:
return nr_reclaimed;
}
+unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+ struct list_head *page_list)
+{
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .priority = DEF_PRIORITY,
+ .may_unmap = 1,
+ };
+ unsigned long ret, dummy1, dummy2;
+ struct page *page, *next;
+ LIST_HEAD(clean_pages);
+
+ list_for_each_entry_safe(page, next, page_list, lru) {
+ if (page_is_file_cache(page) && !PageDirty(page)) {
+ ClearPageActive(page);
+ list_move(&page->lru, &clean_pages);
+ }
+ }
+
+ ret = shrink_page_list(&clean_pages, zone, &sc,
+ TTU_UNMAP|TTU_IGNORE_ACCESS,
+ &dummy1, &dummy2, true);
+ list_splice(&clean_pages, page_list);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+ return ret;
+}
+
/*
* Attempt to remove the specified page from its LRU. Only take this page
* if it is of the appropriate PageActive status. Pages which are being
@@ -978,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
if (!PageLRU(page))
return ret;
- /* Do not give back unevictable pages for compaction */
- if (PageUnevictable(page))
+ /* Compaction should not handle unevictable pages but CMA can do so */
+ if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
return ret;
ret = -EBUSY;
@@ -1186,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
VM_BUG_ON(PageLRU(page));
list_del(&page->lru);
- if (unlikely(!page_evictable(page, NULL))) {
+ if (unlikely(!page_evictable(page))) {
spin_unlock_irq(&zone->lru_lock);
putback_lru_page(page);
spin_lock_irq(&zone->lru_lock);
@@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_page_list(&page_list, zone, sc,
- &nr_dirty, &nr_writeback);
+ nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
+ &nr_dirty, &nr_writeback, false);
spin_lock_irq(&zone->lru_lock);
@@ -1439,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
page = lru_to_page(&l_hold);
list_del(&page->lru);
- if (unlikely(!page_evictable(page, NULL))) {
+ if (unlikely(!page_evictable(page))) {
putback_lru_page(page);
continue;
}
@@ -1729,6 +1760,28 @@ static bool in_reclaim_compaction(struct scan_control *sc)
return false;
}
+#ifdef CONFIG_COMPACTION
+/*
+ * If compaction is deferred for sc->order then scale the number of pages
+ * reclaimed based on the number of consecutive allocation failures
+ */
+static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
+ struct lruvec *lruvec, struct scan_control *sc)
+{
+ struct zone *zone = lruvec_zone(lruvec);
+
+ if (zone->compact_order_failed <= sc->order)
+ pages_for_compaction <<= zone->compact_defer_shift;
+ return pages_for_compaction;
+}
+#else
+static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
+ struct lruvec *lruvec, struct scan_control *sc)
+{
+ return pages_for_compaction;
+}
+#endif
+
/*
* Reclaim/compaction is used for high-order allocation requests. It reclaims
* order-0 pages before compacting the zone. should_continue_reclaim() returns
@@ -1776,6 +1829,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
* inactive lists are large enough, continue reclaiming
*/
pages_for_compaction = (2UL << sc->order);
+
+ pages_for_compaction = scale_for_compaction(pages_for_compaction,
+ lruvec, sc);
inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
if (nr_swap_pages > 0)
inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
@@ -2839,6 +2895,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
+ /*
+ * Compaction records what page blocks it recently failed to
+ * isolate pages from and skips them in the future scanning.
+ * When kswapd is going to sleep, it is reasonable to assume
+ * that pages and compaction may succeed so reset the cache.
+ */
+ reset_isolation_suitable(pgdat);
+
if (!kthread_should_stop())
schedule();
@@ -3101,9 +3165,9 @@ int kswapd_run(int nid)
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
BUG_ON(system_state == SYSTEM_BOOTING);
- printk("Failed to start kswapd on node %d\n",nid);
pgdat->kswapd = NULL;
- ret = -1;
+ pr_err("Failed to start kswapd on node %d\n", nid);
+ ret = PTR_ERR(pgdat->kswapd);
}
return ret;
}
@@ -3350,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
/*
* page_evictable - test whether a page is evictable
* @page: the page to test
- * @vma: the VMA in which the page is or will be mapped, may be NULL
*
* Test whether page is evictable--i.e., should be placed on active/inactive
- * lists vs unevictable list. The vma argument is !NULL when called from the
- * fault path to determine how to instantate a new page.
+ * lists vs unevictable list.
*
* Reasons page might not be evictable:
* (1) page's mapping marked unevictable
* (2) page is part of an mlocked VMA
*
*/
-int page_evictable(struct page *page, struct vm_area_struct *vma)
+int page_evictable(struct page *page)
{
-
- if (mapping_unevictable(page_mapping(page)))
- return 0;
-
- if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
- return 0;
-
- return 1;
+ return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
}
#ifdef CONFIG_SHMEM
@@ -3408,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
if (!PageLRU(page) || !PageUnevictable(page))
continue;
- if (page_evictable(page, NULL)) {
+ if (page_evictable(page)) {
enum lru_list lru = page_lru_base_type(page);
VM_BUG_ON(PageActive(page));
diff --git a/mm/vmstat.c b/mm/vmstat.c
index df7a6748231d..c7370579111b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu)
atomic_long_add(global_diff[i], &vm_stat[i]);
}
+void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
+{
+ int i;
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ if (pset->vm_stat_diff[i]) {
+ int v = pset->vm_stat_diff[i];
+ pset->vm_stat_diff[i] = 0;
+ atomic_long_add(v, &zone->vm_stat[i]);
+ atomic_long_add(v, &vm_stat[i]);
+ }
+}
#endif
#ifdef CONFIG_NUMA
@@ -722,6 +734,7 @@ const char * const vmstat_text[] = {
"numa_other",
#endif
"nr_anon_transparent_hugepages",
+ "nr_free_cma",
"nr_dirty_threshold",
"nr_dirty_background_threshold",
@@ -781,7 +794,6 @@ const char * const vmstat_text[] = {
"unevictable_pgs_munlocked",
"unevictable_pgs_cleared",
"unevictable_pgs_stranded",
- "unevictable_pgs_mlockfreed",
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
"thp_fault_alloc",
@@ -1157,7 +1169,7 @@ static void __cpuinit start_cpu_timer(int cpu)
{
struct delayed_work *work = &per_cpu(vmstat_work, cpu);
- INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update);
+ INIT_DEFERRABLE_WORK(work, vmstat_update);
schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}