summary refs log tree commit diff stats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig 4
-rw-r--r-- mm/Makefile 6
-rw-r--r-- mm/compaction.c 84
-rw-r--r-- mm/early_ioremap.c 245
-rw-r--r-- mm/filemap.c 430
-rw-r--r-- mm/huge_memory.c 21
-rw-r--r-- mm/hugetlb.c 14
-rw-r--r-- mm/internal.h 16
-rw-r--r-- mm/iov_iter.c 224
-rw-r--r-- mm/memblock.c 28
-rw-r--r-- mm/memcontrol.c 453
-rw-r--r-- mm/memory.c 147
-rw-r--r-- mm/mempolicy.c 46
-rw-r--r-- mm/mempool.c 4
-rw-r--r-- mm/mlock.c 2
-rw-r--r-- mm/mmap.c 55
-rw-r--r-- mm/mprotect.c 56
-rw-r--r-- mm/nommu.c 49
-rw-r--r-- mm/page-writeback.c 4
-rw-r--r-- mm/page_alloc.c 118
-rw-r--r-- mm/process_vm_access.c 250
-rw-r--r-- mm/readahead.c 21
-rw-r--r-- mm/rmap.c 14
-rw-r--r-- mm/shmem.c 86
-rw-r--r-- mm/slab.c 191
-rw-r--r-- mm/slab.h 21
-rw-r--r-- mm/slab_common.c 250
-rw-r--r-- mm/slob.c 10
-rw-r--r-- mm/slub.c 92
-rw-r--r-- mm/sparse.c 4
-rw-r--r-- mm/util.c 53
-rw-r--r-- mm/vmacache.c 112
-rw-r--r-- mm/vmalloc.c 10
-rw-r--r-- mm/vmscan.c 28
-rw-r--r-- mm/vmstat.c 6
-rw-r--r-- mm/zsmalloc.c 17
-rw-r--r-- mm/zswap.c 86
37 files changed, 1916 insertions, 1341 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 2888024e0b0a..ebe5880c29d6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -216,6 +216,7 @@ config PAGEFLAGS_EXTENDED
#
config SPLIT_PTLOCK_CPUS
int
+ default "999999" if !MMU
default "999999" if ARM && !CPU_CACHE_VIPT
default "999999" if PARISC && !PA20
default "4"
@@ -577,3 +578,6 @@ config PGTABLE_MAPPING
You can check speed with zsmalloc benchmark:
https://github.com/spartacus06/zsmapbench
+
+config GENERIC_EARLY_IOREMAP
+ bool
diff --git a/mm/Makefile b/mm/Makefile
index cdd741519ee0..b484452dac57 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,8 +16,9 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
- compaction.o balloon_compaction.o \
- interval_tree.o list_lru.o workingset.o $(mmu-y)
+ compaction.o balloon_compaction.o vmacache.o \
+ interval_tree.o list_lru.o workingset.o \
+ iov_iter.o $(mmu-y)
obj-y += init-mm.o
@@ -61,3 +62,4 @@ obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZBUD) += zbud.o
obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
+obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
diff --git a/mm/compaction.c b/mm/compaction.c
index b6ab77160068..37f976287068 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -217,21 +217,12 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct page *page)
{
- int migratetype = get_pageblock_migratetype(page);
-
- /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
- if (migratetype == MIGRATE_RESERVE)
- return false;
-
- if (is_migrate_isolate(migratetype))
- return false;
-
- /* If the page is a large free page, then allow migration */
+ /* If the page is a large free page, then disallow migration */
if (PageBuddy(page) && page_order(page) >= pageblock_order)
- return true;
+ return false;
/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
- if (migrate_async_suitable(migratetype))
+ if (migrate_async_suitable(get_pageblock_migratetype(page)))
return true;
/* Otherwise skip the block */
@@ -253,6 +244,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
struct page *cursor, *valid_page = NULL;
unsigned long flags;
bool locked = false;
+ bool checked_pageblock = false;
cursor = pfn_to_page(blockpfn);
@@ -284,8 +276,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
break;
/* Recheck this is a suitable migration target under lock */
- if (!strict && !suitable_migration_target(page))
- break;
+ if (!strict && !checked_pageblock) {
+ /*
+ * We need to check suitability of pageblock only once
+ * and this isolate_freepages_block() is called with
+ * pageblock range, so just check once is sufficient.
+ */
+ checked_pageblock = true;
+ if (!suitable_migration_target(page))
+ break;
+ }
/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
@@ -460,12 +460,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
unsigned long last_pageblock_nr = 0, pageblock_nr;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
- isolate_mode_t mode = 0;
struct lruvec *lruvec;
unsigned long flags;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
bool skipped_async_unsuitable = false;
+ const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) |
+ (unevictable ? ISOLATE_UNEVICTABLE : 0);
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -487,7 +488,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
cond_resched();
for (; low_pfn < end_pfn; low_pfn++) {
/* give a chance to irqs before checking need_resched() */
- if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+ if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
if (should_release_lock(&zone->lru_lock)) {
spin_unlock_irqrestore(&zone->lru_lock, flags);
locked = false;
@@ -526,8 +527,25 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
/* If isolation recently failed, do not retry */
pageblock_nr = low_pfn >> pageblock_order;
- if (!isolation_suitable(cc, page))
- goto next_pageblock;
+ if (last_pageblock_nr != pageblock_nr) {
+ int mt;
+
+ last_pageblock_nr = pageblock_nr;
+ if (!isolation_suitable(cc, page))
+ goto next_pageblock;
+
+ /*
+ * For async migration, also only scan in MOVABLE
+ * blocks. Async migration is optimistic to see if
+ * the minimum amount of work satisfies the allocation
+ */
+ mt = get_pageblock_migratetype(page);
+ if (!cc->sync && !migrate_async_suitable(mt)) {
+ cc->finished_update_migrate = true;
+ skipped_async_unsuitable = true;
+ goto next_pageblock;
+ }
+ }
/*
* Skip if free. page_order cannot be used without zone->lock
@@ -537,18 +555,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
continue;
/*
- * For async migration, also only scan in MOVABLE blocks. Async
- * migration is optimistic to see if the minimum amount of work
- * satisfies the allocation
- */
- if (!cc->sync && last_pageblock_nr != pageblock_nr &&
- !migrate_async_suitable(get_pageblock_migratetype(page))) {
- cc->finished_update_migrate = true;
- skipped_async_unsuitable = true;
- goto next_pageblock;
- }
-
- /*
* Check may be lockless but that's ok as we recheck later.
* It's possible to migrate LRU pages and balloon pages
* Skip any other type of page
@@ -557,11 +563,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
if (unlikely(balloon_page_movable(page))) {
if (locked && balloon_page_isolate(page)) {
/* Successfully isolated */
- cc->finished_update_migrate = true;
- list_add(&page->lru, migratelist);
- cc->nr_migratepages++;
- nr_isolated++;
- goto check_compact_cluster;
+ goto isolate_success;
}
}
continue;
@@ -607,12 +609,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
continue;
}
- if (!cc->sync)
- mode |= ISOLATE_ASYNC_MIGRATE;
-
- if (unevictable)
- mode |= ISOLATE_UNEVICTABLE;
-
lruvec = mem_cgroup_page_lruvec(page, zone);
/* Try isolate the page */
@@ -622,13 +618,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* Successfully isolated */
- cc->finished_update_migrate = true;
del_page_from_lru_list(page, lruvec, page_lru(page));
+
+isolate_success:
+ cc->finished_update_migrate = true;
list_add(&page->lru, migratelist);
cc->nr_migratepages++;
nr_isolated++;
-check_compact_cluster:
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
@@ -639,7 +636,6 @@ check_compact_cluster:
next_pageblock:
low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
- last_pageblock_nr = pageblock_nr;
}
acct_isolated(zone, locked, cc);
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
new file mode 100644
index 000000000000..e10ccd299d66
--- /dev/null
+++ b/mm/early_ioremap.c
@@ -0,0 +1,245 @@
+/*
+ * Provide common bits of early_ioremap() support for architectures needing
+ * temporary mappings during boot before ioremap() is available.
+ *
+ * This is mostly a direct copy of the x86 early_ioremap implementation.
+ *
+ * (C) Copyright 1995 1996, 2014 Linus Torvalds
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/fixmap.h>
+
+#ifdef CONFIG_MMU
+static int early_ioremap_debug __initdata;
+
+static int __init early_ioremap_debug_setup(char *str)
+{
+ early_ioremap_debug = 1;
+
+ return 0;
+}
+early_param("early_ioremap_debug", early_ioremap_debug_setup);
+
+static int after_paging_init __initdata;
+
+void __init __weak early_ioremap_shutdown(void)
+{
+}
+
+void __init early_ioremap_reset(void)
+{
+ early_ioremap_shutdown();
+ after_paging_init = 1;
+}
+
+/*
+ * Generally, ioremap() is available after paging_init() has been called.
+ * Architectures wanting to allow early_ioremap after paging_init() can
+ * define __late_set_fixmap and __late_clear_fixmap to do the right thing.
+ */
+#ifndef __late_set_fixmap
+static inline void __init __late_set_fixmap(enum fixed_addresses idx,
+ phys_addr_t phys, pgprot_t prot)
+{
+ BUG();
+}
+#endif
+
+#ifndef __late_clear_fixmap
+static inline void __init __late_clear_fixmap(enum fixed_addresses idx)
+{
+ BUG();
+}
+#endif
+
+static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
+void __init early_ioremap_setup(void)
+{
+ int i;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ if (WARN_ON(prev_map[i]))
+ break;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+}
+
+static int __init check_early_ioremap_leak(void)
+{
+ int count = 0;
+ int i;
+
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+ if (prev_map[i])
+ count++;
+
+ if (WARN(count, KERN_WARNING
+ "Debug warning: early ioremap leak of %d areas detected.\n"
+ "please boot with early_ioremap_debug and report the dmesg.\n",
+ count))
+ return 1;
+ return 0;
+}
+late_initcall(check_early_ioremap_leak);
+
+static void __init __iomem *
+__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
+{
+ unsigned long offset;
+ resource_size_t last_addr;
+ unsigned int nrpages;
+ enum fixed_addresses idx;
+ int i, slot;
+
+ WARN_ON(system_state != SYSTEM_BOOTING);
+
+ slot = -1;
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (!prev_map[i]) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n",
+ __func__, (u64)phys_addr, size))
+ return NULL;
+
+ /* Don't allow wraparound or zero size */
+ last_addr = phys_addr + size - 1;
+ if (WARN_ON(!size || last_addr < phys_addr))
+ return NULL;
+
+ prev_size[slot] = size;
+ /*
+ * Mappings have to be page-aligned
+ */
+ offset = phys_addr & ~PAGE_MASK;
+ phys_addr &= PAGE_MASK;
+ size = PAGE_ALIGN(last_addr + 1) - phys_addr;
+
+ /*
+ * Mappings have to fit in the FIX_BTMAP area.
+ */
+ nrpages = size >> PAGE_SHIFT;
+ if (WARN_ON(nrpages > NR_FIX_BTMAPS))
+ return NULL;
+
+ /*
+ * Ok, go for it..
+ */
+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+ while (nrpages > 0) {
+ if (after_paging_init)
+ __late_set_fixmap(idx, phys_addr, prot);
+ else
+ __early_set_fixmap(idx, phys_addr, prot);
+ phys_addr += PAGE_SIZE;
+ --idx;
+ --nrpages;
+ }
+ WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n",
+ __func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]);
+
+ prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
+ return prev_map[slot];
+}
+
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{
+ unsigned long virt_addr;
+ unsigned long offset;
+ unsigned int nrpages;
+ enum fixed_addresses idx;
+ int i, slot;
+
+ slot = -1;
+ for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+ if (prev_map[i] == addr) {
+ slot = i;
+ break;
+ }
+ }
+
+ if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n",
+ addr, size))
+ return;
+
+ if (WARN(prev_size[slot] != size,
+ "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
+ addr, size, slot, prev_size[slot]))
+ return;
+
+ WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n",
+ addr, size, slot);
+
+ virt_addr = (unsigned long)addr;
+ if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
+ return;
+
+ offset = virt_addr & ~PAGE_MASK;
+ nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
+
+ idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
+ while (nrpages > 0) {
+ if (after_paging_init)
+ __late_clear_fixmap(idx);
+ else
+ __early_set_fixmap(idx, 0, FIXMAP_PAGE_CLEAR);
+ --idx;
+ --nrpages;
+ }
+ prev_map[slot] = NULL;
+}
+
+/* Remap an IO device */
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+ return __early_ioremap(phys_addr, size, FIXMAP_PAGE_IO);
+}
+
+/* Remap memory */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+ return (__force void *)__early_ioremap(phys_addr, size,
+ FIXMAP_PAGE_NORMAL);
+}
+#else /* CONFIG_MMU */
+
+void __init __iomem *
+early_ioremap(resource_size_t phys_addr, unsigned long size)
+{
+ return (__force void __iomem *)phys_addr;
+}
+
+/* Remap memory */
+void __init *
+early_memremap(resource_size_t phys_addr, unsigned long size)
+{
+ return (void *)phys_addr;
+}
+
+void __init early_iounmap(void __iomem *addr, unsigned long size)
+{
+}
+
+#endif /* CONFIG_MMU */
+
+
+void __init early_memunmap(void *addr, unsigned long size)
+{
+ early_iounmap((__force void __iomem *)addr, size);
+}
diff --git a/mm/filemap.c b/mm/filemap.c
index 21781f1fe52b..a82fbe4c9e8e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
+#include <linux/rmap.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -76,7 +77,7 @@
* ->mmap_sem
* ->lock_page (access_process_vm)
*
- * ->i_mutex (generic_file_buffered_write)
+ * ->i_mutex (generic_perform_write)
* ->mmap_sem (fault_in_pages_readable->do_page_fault)
*
* bdi->wb.list_lock
@@ -562,7 +563,7 @@ static int __add_to_page_cache_locked(struct page *page,
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
- error = mem_cgroup_cache_charge(page, current->mm,
+ error = mem_cgroup_charge_file(page, current->mm,
gfp_mask & GFP_RECLAIM_MASK);
if (error)
return error;
@@ -1427,7 +1428,8 @@ static void shrink_readahead_size_eio(struct file *filp,
* do_generic_file_read - generic file read routine
* @filp: the file to read
* @ppos: current file position
- * @desc: read_descriptor
+ * @iter: data destination
+ * @written: already copied
*
* This is a generic file read routine, and uses the
* mapping->a_ops->readpage() function for the actual low-level stuff.
@@ -1435,8 +1437,8 @@ static void shrink_readahead_size_eio(struct file *filp,
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
*/
-static void do_generic_file_read(struct file *filp, loff_t *ppos,
- read_descriptor_t *desc)
+static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
+ struct iov_iter *iter, ssize_t written)
{
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
@@ -1446,12 +1448,12 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos,
pgoff_t prev_index;
unsigned long offset; /* offset into pagecache page */
unsigned int prev_offset;
- int error;
+ int error = 0;
index = *ppos >> PAGE_CACHE_SHIFT;
prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
- last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
+ last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
for (;;) {
@@ -1486,7 +1488,7 @@ find_page:
if (!page->mapping)
goto page_not_up_to_date_locked;
if (!mapping->a_ops->is_partially_uptodate(page,
- desc, offset))
+ offset, iter->count))
goto page_not_up_to_date_locked;
unlock_page(page);
}
@@ -1536,24 +1538,23 @@ page_ok:
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
- *
- * The file_read_actor routine returns how many bytes were
- * actually used..
- * NOTE! This may not be the same as how much of a user buffer
- * we filled up (we may be padding etc), so we can only update
- * "pos" here (the actor routine has to update the user buffer
- * pointers and the remaining count).
*/
- ret = file_read_actor(desc, page, offset, nr);
+
+ ret = copy_page_to_iter(page, offset, nr, iter);
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
prev_offset = offset;
page_cache_release(page);
- if (ret == nr && desc->count)
- continue;
- goto out;
+ written += ret;
+ if (!iov_iter_count(iter))
+ goto out;
+ if (ret < nr) {
+ error = -EFAULT;
+ goto out;
+ }
+ continue;
page_not_up_to_date:
/* Get exclusive access to the page ... */
@@ -1588,6 +1589,7 @@ readpage:
if (unlikely(error)) {
if (error == AOP_TRUNCATED_PAGE) {
page_cache_release(page);
+ error = 0;
goto find_page;
}
goto readpage_error;
@@ -1618,7 +1620,6 @@ readpage:
readpage_error:
/* UHHUH! A synchronous read error occurred. Report it */
- desc->error = error;
page_cache_release(page);
goto out;
@@ -1629,16 +1630,17 @@ no_cached_page:
*/
page = page_cache_alloc_cold(mapping);
if (!page) {
- desc->error = -ENOMEM;
+ error = -ENOMEM;
goto out;
}
error = add_to_page_cache_lru(page, mapping,
index, GFP_KERNEL);
if (error) {
page_cache_release(page);
- if (error == -EEXIST)
+ if (error == -EEXIST) {
+ error = 0;
goto find_page;
- desc->error = error;
+ }
goto out;
}
goto readpage;
@@ -1651,44 +1653,7 @@ out:
*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
file_accessed(filp);
-}
-
-int file_read_actor(read_descriptor_t *desc, struct page *page,
- unsigned long offset, unsigned long size)
-{
- char *kaddr;
- unsigned long left, count = desc->count;
-
- if (size > count)
- size = count;
-
- /*
- * Faults on the destination of a read are common, so do it before
- * taking the kmap.
- */
- if (!fault_in_pages_writeable(desc->arg.buf, size)) {
- kaddr = kmap_atomic(page);
- left = __copy_to_user_inatomic(desc->arg.buf,
- kaddr + offset, size);
- kunmap_atomic(kaddr);
- if (left == 0)
- goto success;
- }
-
- /* Do it the slow way */
- kaddr = kmap(page);
- left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
- kunmap(page);
-
- if (left) {
- size -= left;
- desc->error = -EFAULT;
- }
-success:
- desc->count = count - size;
- desc->written += size;
- desc->arg.buf += size;
- return size;
+ return written ? written : error;
}
/*
@@ -1746,14 +1711,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
{
struct file *filp = iocb->ki_filp;
ssize_t retval;
- unsigned long seg = 0;
size_t count;
loff_t *ppos = &iocb->ki_pos;
+ struct iov_iter i;
count = 0;
retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
if (retval)
return retval;
+ iov_iter_init(&i, iov, nr_segs, count, 0);
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (filp->f_flags & O_DIRECT) {
@@ -1775,6 +1741,11 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
if (retval > 0) {
*ppos = pos + retval;
count -= retval;
+ /*
+ * If we did a short DIO read we need to skip the
+ * section of the iov that we've already read data into.
+ */
+ iov_iter_advance(&i, retval);
}
/*
@@ -1791,39 +1762,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
}
}
- count = retval;
- for (seg = 0; seg < nr_segs; seg++) {
- read_descriptor_t desc;
- loff_t offset = 0;
-
- /*
- * If we did a short DIO read we need to skip the section of the
- * iov that we've already read data into.
- */
- if (count) {
- if (count > iov[seg].iov_len) {
- count -= iov[seg].iov_len;
- continue;
- }
- offset = count;
- count = 0;
- }
-
- desc.written = 0;
- desc.arg.buf = iov[seg].iov_base + offset;
- desc.count = iov[seg].iov_len - offset;
- if (desc.count == 0)
- continue;
- desc.error = 0;
- do_generic_file_read(filp, ppos, &desc);
- retval += desc.written;
- if (desc.error) {
- retval = retval ?: desc.error;
- break;
- }
- if (desc.count > 0)
- break;
- }
+ retval = do_generic_file_read(filp, ppos, &i, retval);
out:
return retval;
}
@@ -1952,11 +1891,11 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
struct page *page;
- pgoff_t size;
+ loff_t size;
int ret = 0;
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (offset >= size)
+ size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
+ if (offset >= size >> PAGE_CACHE_SHIFT)
return VM_FAULT_SIGBUS;
/*
@@ -2005,8 +1944,8 @@ retry_find:
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (unlikely(offset >= size)) {
+ size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
+ if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) {
unlock_page(page);
page_cache_release(page);
return VM_FAULT_SIGBUS;
@@ -2064,6 +2003,78 @@ page_not_uptodate:
}
EXPORT_SYMBOL(filemap_fault);
+void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ loff_t size;
+ struct page *page;
+ unsigned long address = (unsigned long) vmf->virtual_address;
+ unsigned long addr;
+ pte_t *pte;
+
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) {
+ if (iter.index > vmf->max_pgoff)
+ break;
+repeat:
+ page = radix_tree_deref_slot(slot);
+ if (unlikely(!page))
+ goto next;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page))
+ break;
+ else
+ goto next;
+ }
+
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+
+ if (!PageUptodate(page) ||
+ PageReadahead(page) ||
+ PageHWPoison(page))
+ goto skip;
+ if (!trylock_page(page))
+ goto skip;
+
+ if (page->mapping != mapping || !PageUptodate(page))
+ goto unlock;
+
+ size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE);
+ if (page->index >= size >> PAGE_CACHE_SHIFT)
+ goto unlock;
+
+ pte = vmf->pte + page->index - vmf->pgoff;
+ if (!pte_none(*pte))
+ goto unlock;
+
+ if (file->f_ra.mmap_miss > 0)
+ file->f_ra.mmap_miss--;
+ addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
+ do_set_pte(vma, addr, page, pte, false, false);
+ unlock_page(page);
+ goto next;
+unlock:
+ unlock_page(page);
+skip:
+ page_cache_release(page);
+next:
+ if (iter.index == vmf->max_pgoff)
+ break;
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(filemap_map_pages);
+
int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page = vmf->page;
@@ -2093,6 +2104,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = filemap_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -2261,150 +2273,6 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
}
EXPORT_SYMBOL(read_cache_page_gfp);
-static size_t __iovec_copy_from_user_inatomic(char *vaddr,
- const struct iovec *iov, size_t base, size_t bytes)
-{
- size_t copied = 0, left = 0;
-
- while (bytes) {
- char __user *buf = iov->iov_base + base;
- int copy = min(bytes, iov->iov_len - base);
-
- base = 0;
- left = __copy_from_user_inatomic(vaddr, buf, copy);
- copied += copy;
- bytes -= copy;
- vaddr += copy;
- iov++;
-
- if (unlikely(left))
- break;
- }
- return copied - left;
-}
-
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were successfully copied. If a fault is encountered then return the number of
- * bytes which were copied.
- */
-size_t iov_iter_copy_from_user_atomic(struct page *page,
- struct iov_iter *i, unsigned long offset, size_t bytes)
-{
- char *kaddr;
- size_t copied;
-
- BUG_ON(!in_atomic());
- kaddr = kmap_atomic(page);
- if (likely(i->nr_segs == 1)) {
- int left;
- char __user *buf = i->iov->iov_base + i->iov_offset;
- left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
- copied = bytes - left;
- } else {
- copied = __iovec_copy_from_user_inatomic(kaddr + offset,
- i->iov, i->iov_offset, bytes);
- }
- kunmap_atomic(kaddr);
-
- return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
-
-/*
- * This has the same sideeffects and return value as
- * iov_iter_copy_from_user_atomic().
- * The difference is that it attempts to resolve faults.
- * Page must not be locked.
- */
-size_t iov_iter_copy_from_user(struct page *page,
- struct iov_iter *i, unsigned long offset, size_t bytes)
-{
- char *kaddr;
- size_t copied;
-
- kaddr = kmap(page);
- if (likely(i->nr_segs == 1)) {
- int left;
- char __user *buf = i->iov->iov_base + i->iov_offset;
- left = __copy_from_user(kaddr + offset, buf, bytes);
- copied = bytes - left;
- } else {
- copied = __iovec_copy_from_user_inatomic(kaddr + offset,
- i->iov, i->iov_offset, bytes);
- }
- kunmap(page);
- return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user);
-
-void iov_iter_advance(struct iov_iter *i, size_t bytes)
-{
- BUG_ON(i->count < bytes);
-
- if (likely(i->nr_segs == 1)) {
- i->iov_offset += bytes;
- i->count -= bytes;
- } else {
- const struct iovec *iov = i->iov;
- size_t base = i->iov_offset;
- unsigned long nr_segs = i->nr_segs;
-
- /*
- * The !iov->iov_len check ensures we skip over unlikely
- * zero-length segments (without overruning the iovec).
- */
- while (bytes || unlikely(i->count && !iov->iov_len)) {
- int copy;
-
- copy = min(bytes, iov->iov_len - base);
- BUG_ON(!i->count || i->count < copy);
- i->count -= copy;
- bytes -= copy;
- base += copy;
- if (iov->iov_len == base) {
- iov++;
- nr_segs--;
- base = 0;
- }
- }
- i->iov = iov;
- i->iov_offset = base;
- i->nr_segs = nr_segs;
- }
-}
-EXPORT_SYMBOL(iov_iter_advance);
-
-/*
- * Fault in the first iovec of the given iov_iter, to a maximum length
- * of bytes. Returns 0 on success, or non-zero if the memory could not be
- * accessed (ie. because it is an invalid address).
- *
- * writev-intensive code may want this to prefault several iovecs -- that
- * would be possible (callers must not rely on the fact that _only_ the
- * first iovec will be faulted with the current implementation).
- */
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
-{
- char __user *buf = i->iov->iov_base + i->iov_offset;
- bytes = min(bytes, i->iov->iov_len - i->iov_offset);
- return fault_in_pages_readable(buf, bytes);
-}
-EXPORT_SYMBOL(iov_iter_fault_in_readable);
-
-/*
- * Return the count of just the current iov_iter segment.
- */
-size_t iov_iter_single_seg_count(const struct iov_iter *i)
-{
- const struct iovec *iov = i->iov;
- if (i->nr_segs == 1)
- return i->count;
- else
- return min(i->count, iov->iov_len - i->iov_offset);
-}
-EXPORT_SYMBOL(iov_iter_single_seg_count);
-
/*
* Performs necessary checks before doing a write
*
@@ -2511,7 +2379,7 @@ EXPORT_SYMBOL(pagecache_write_end);
ssize_t
generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long *nr_segs, loff_t pos, loff_t *ppos,
+ unsigned long *nr_segs, loff_t pos,
size_t count, size_t ocount)
{
struct file *file = iocb->ki_filp;
@@ -2572,7 +2440,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
i_size_write(inode, pos);
mark_inode_dirty(inode);
}
- *ppos = pos;
+ iocb->ki_pos = pos;
}
out:
return written;
@@ -2618,7 +2486,7 @@ found:
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
-static ssize_t generic_perform_write(struct file *file,
+ssize_t generic_perform_write(struct file *file,
struct iov_iter *i, loff_t pos)
{
struct address_space *mapping = file->f_mapping;
@@ -2668,9 +2536,7 @@ again:
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
- pagefault_enable();
flush_dcache_page(page);
mark_page_accessed(page);
@@ -2708,27 +2574,7 @@ again:
return written ? written : status;
}
-
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, loff_t *ppos,
- size_t count, ssize_t written)
-{
- struct file *file = iocb->ki_filp;
- ssize_t status;
- struct iov_iter i;
-
- iov_iter_init(&i, iov, nr_segs, count, written);
- status = generic_perform_write(file, &i, pos);
-
- if (likely(status >= 0)) {
- written += status;
- *ppos = pos + status;
- }
-
- return written ? written : status;
-}
-EXPORT_SYMBOL(generic_file_buffered_write);
+EXPORT_SYMBOL(generic_perform_write);
/**
* __generic_file_aio_write - write data to a file
@@ -2750,16 +2596,18 @@ EXPORT_SYMBOL(generic_file_buffered_write);
* avoid syncing under i_mutex.
*/
ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
+ unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
size_t ocount; /* original count */
size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
- loff_t pos;
- ssize_t written;
+ loff_t pos = iocb->ki_pos;
+ ssize_t written = 0;
ssize_t err;
+ ssize_t status;
+ struct iov_iter from;
ocount = 0;
err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
@@ -2767,12 +2615,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
return err;
count = ocount;
- pos = *ppos;
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
- written = 0;
-
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
goto out;
@@ -2788,45 +2633,47 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err)
goto out;
+ iov_iter_init(&from, iov, nr_segs, count, 0);
+
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (unlikely(file->f_flags & O_DIRECT)) {
loff_t endbyte;
- ssize_t written_buffered;
- written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
- ppos, count, ocount);
+ written = generic_file_direct_write(iocb, iov, &from.nr_segs, pos,
+ count, ocount);
if (written < 0 || written == count)
goto out;
+ iov_iter_advance(&from, written);
+
/*
* direct-io write to a hole: fall through to buffered I/O
* for completing the rest of the request.
*/
pos += written;
count -= written;
- written_buffered = generic_file_buffered_write(iocb, iov,
- nr_segs, pos, ppos, count,
- written);
+
+ status = generic_perform_write(file, &from, pos);
/*
- * If generic_file_buffered_write() retuned a synchronous error
+ * If generic_perform_write() returned a synchronous error
* then we want to return the number of bytes which were
* direct-written, or the error code if that was zero. Note
* that this differs from normal direct-io semantics, which
* will return -EFOO even if some bytes were written.
*/
- if (written_buffered < 0) {
- err = written_buffered;
+ if (unlikely(status < 0) && !written) {
+ err = status;
goto out;
}
-
+ iocb->ki_pos = pos + status;
/*
* We need to ensure that the page cache pages are written to
* disk and invalidated to preserve the expected O_DIRECT
* semantics.
*/
- endbyte = pos + written_buffered - written - 1;
+ endbyte = pos + status - 1;
err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
if (err == 0) {
- written = written_buffered;
+ written += status;
invalidate_mapping_pages(mapping,
pos >> PAGE_CACHE_SHIFT,
endbyte >> PAGE_CACHE_SHIFT);
@@ -2837,8 +2684,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
*/
}
} else {
- written = generic_file_buffered_write(iocb, iov, nr_segs,
- pos, ppos, count, written);
+ written = generic_perform_write(file, &from, pos);
+ if (likely(written >= 0))
+ iocb->ki_pos = pos + written;
}
out:
current->backing_dev_info = NULL;
@@ -2867,7 +2715,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
BUG_ON(iocb->ki_pos != pos);
mutex_lock(&inode->i_mutex);
- ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+ ret = __generic_file_aio_write(iocb, iov, nr_segs);
mutex_unlock(&inode->i_mutex);
if (ret > 0) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6ac89e9f82ef..64635f5278ff 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -827,7 +827,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+ if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -968,7 +968,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
__GFP_OTHER_NODE,
vma, address, page_to_nid(page));
if (unlikely(!pages[i] ||
- mem_cgroup_newpage_charge(pages[i], mm,
+ mem_cgroup_charge_anon(pages[i], mm,
GFP_KERNEL))) {
if (pages[i])
put_page(pages[i]);
@@ -1101,7 +1101,7 @@ alloc:
goto out;
}
- if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+ if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
if (page) {
split_huge_page(page);
@@ -1891,17 +1891,22 @@ out:
int hugepage_madvise(struct vm_area_struct *vma,
unsigned long *vm_flags, int advice)
{
- struct mm_struct *mm = vma->vm_mm;
-
switch (advice) {
case MADV_HUGEPAGE:
+#ifdef CONFIG_S390
+ /*
+ * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
+ * can't handle this properly after s390_enable_sie, so we simply
+ * ignore the madvise to prevent qemu from causing a SIGSEGV.
+ */
+ if (mm_has_pgste(vma->vm_mm))
+ return 0;
+#endif
/*
* Be somewhat over-protective like KSM for now!
*/
if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
return -EINVAL;
- if (mm->def_flags & VM_NOHUGEPAGE)
- return -EINVAL;
*vm_flags &= ~VM_NOHUGEPAGE;
*vm_flags |= VM_HUGEPAGE;
/*
@@ -2354,7 +2359,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (!new_page)
return;
- if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
+ if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)))
return;
/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c02b9dadfb0..dd30f22b35e0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,6 +13,7 @@
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
+#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/bootmem.h>
@@ -1535,6 +1536,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
while (min_count < persistent_huge_pages(h)) {
if (!free_pool_huge_page(h, nodes_allowed, 0))
break;
+ cond_resched_lock(&hugetlb_lock);
}
while (count < persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, 1))
@@ -2690,7 +2692,8 @@ retry_avoidcopy:
BUG_ON(huge_pte_none(pte));
spin_lock(ptl);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
- if (likely(pte_same(huge_ptep_get(ptep), pte)))
+ if (likely(ptep &&
+ pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
/*
* race occurs while re-acquiring page table
@@ -2734,7 +2737,7 @@ retry_avoidcopy:
*/
spin_lock(ptl);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
- if (likely(pte_same(huge_ptep_get(ptep), pte))) {
+ if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
ClearPagePrivate(new_page);
/* Break COW */
@@ -2896,8 +2899,7 @@ retry:
if (anon_rmap) {
ClearPagePrivate(page);
hugepage_add_new_anon_rmap(page, vma, address);
- }
- else
+ } else
page_dup_rmap(page);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
@@ -3185,6 +3187,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
+ mmu_notifier_invalidate_range_start(mm, start, end);
mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
@@ -3214,6 +3217,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
*/
flush_tlb_range(vma, start, end);
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ mmu_notifier_invalidate_range_end(mm, start, end);
return pages << h->order;
}
@@ -3518,7 +3522,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
/* Can be overriden by architectures */
-__attribute__((weak)) struct page *
+struct page * __weak
follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int write)
{
diff --git a/mm/internal.h b/mm/internal.h
index 29e1e761f9eb..07b67361a40a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,7 @@
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H
+#include <linux/fs.h>
#include <linux/mm.h>
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
@@ -21,6 +22,20 @@ static inline void set_page_count(struct page *page, int v)
atomic_set(&page->_count, v);
}
+extern int __do_page_cache_readahead(struct address_space *mapping,
+ struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+ unsigned long lookahead_size);
+
+/*
+ * Submit IO for the read-ahead request in file_ra_state.
+ */
+static inline unsigned long ra_submit(struct file_ra_state *ra,
+ struct address_space *mapping, struct file *filp)
+{
+ return __do_page_cache_readahead(mapping, filp,
+ ra->start, ra->size, ra->async_size);
+}
+
/*
* Turn a non-refcounted page (->_count == 0) into refcounted with
* a count of one.
@@ -370,5 +385,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
+#define ALLOC_FAIR 0x100 /* fair zone allocation */
#endif /* __MM_INTERNAL_H */
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
new file mode 100644
index 000000000000..10e46cd721de
--- /dev/null
+++ b/mm/iov_iter.c
@@ -0,0 +1,224 @@
+#include <linux/export.h>
+#include <linux/uio.h>
+#include <linux/pagemap.h>
+
+size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
+ struct iov_iter *i)
+{
+ size_t skip, copy, left, wanted;
+ const struct iovec *iov;
+ char __user *buf;
+ void *kaddr, *from;
+
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ wanted = bytes;
+ iov = i->iov;
+ skip = i->iov_offset;
+ buf = iov->iov_base + skip;
+ copy = min(bytes, iov->iov_len - skip);
+
+ if (!fault_in_pages_writeable(buf, copy)) {
+ kaddr = kmap_atomic(page);
+ from = kaddr + offset;
+
+ /* first chunk, usually the only one */
+ left = __copy_to_user_inatomic(buf, from, copy);
+ copy -= left;
+ skip += copy;
+ from += copy;
+ bytes -= copy;
+
+ while (unlikely(!left && bytes)) {
+ iov++;
+ buf = iov->iov_base;
+ copy = min(bytes, iov->iov_len);
+ left = __copy_to_user_inatomic(buf, from, copy);
+ copy -= left;
+ skip = copy;
+ from += copy;
+ bytes -= copy;
+ }
+ if (likely(!bytes)) {
+ kunmap_atomic(kaddr);
+ goto done;
+ }
+ offset = from - kaddr;
+ buf += copy;
+ kunmap_atomic(kaddr);
+ copy = min(bytes, iov->iov_len - skip);
+ }
+ /* Too bad - revert to non-atomic kmap */
+ kaddr = kmap(page);
+ from = kaddr + offset;
+ left = __copy_to_user(buf, from, copy);
+ copy -= left;
+ skip += copy;
+ from += copy;
+ bytes -= copy;
+ while (unlikely(!left && bytes)) {
+ iov++;
+ buf = iov->iov_base;
+ copy = min(bytes, iov->iov_len);
+ left = __copy_to_user(buf, from, copy);
+ copy -= left;
+ skip = copy;
+ from += copy;
+ bytes -= copy;
+ }
+ kunmap(page);
+done:
+ i->count -= wanted - bytes;
+ i->nr_segs -= iov - i->iov;
+ i->iov = iov;
+ i->iov_offset = skip;
+ return wanted - bytes;
+}
+EXPORT_SYMBOL(copy_page_to_iter);
+
+static size_t __iovec_copy_from_user_inatomic(char *vaddr,
+ const struct iovec *iov, size_t base, size_t bytes)
+{
+ size_t copied = 0, left = 0;
+
+ while (bytes) {
+ char __user *buf = iov->iov_base + base;
+ int copy = min(bytes, iov->iov_len - base);
+
+ base = 0;
+ left = __copy_from_user_inatomic(vaddr, buf, copy);
+ copied += copy;
+ bytes -= copy;
+ vaddr += copy;
+ iov++;
+
+ if (unlikely(left))
+ break;
+ }
+ return copied - left;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied. If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+size_t iov_iter_copy_from_user_atomic(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ char *kaddr;
+ size_t copied;
+
+ kaddr = kmap_atomic(page);
+ if (likely(i->nr_segs == 1)) {
+ int left;
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+ copied = bytes - left;
+ } else {
+ copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+ i->iov, i->iov_offset, bytes);
+ }
+ kunmap_atomic(kaddr);
+
+ return copied;
+}
+EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
+
+/*
+ * This has the same side effects and return value as
+ * iov_iter_copy_from_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+size_t iov_iter_copy_from_user(struct page *page,
+ struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+ char *kaddr;
+ size_t copied;
+
+ kaddr = kmap(page);
+ if (likely(i->nr_segs == 1)) {
+ int left;
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ left = __copy_from_user(kaddr + offset, buf, bytes);
+ copied = bytes - left;
+ } else {
+ copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+ i->iov, i->iov_offset, bytes);
+ }
+ kunmap(page);
+ return copied;
+}
+EXPORT_SYMBOL(iov_iter_copy_from_user);
+
+void iov_iter_advance(struct iov_iter *i, size_t bytes)
+{
+ BUG_ON(i->count < bytes);
+
+ if (likely(i->nr_segs == 1)) {
+ i->iov_offset += bytes;
+ i->count -= bytes;
+ } else {
+ const struct iovec *iov = i->iov;
+ size_t base = i->iov_offset;
+ unsigned long nr_segs = i->nr_segs;
+
+ /*
+ * The !iov->iov_len check ensures we skip over unlikely
+ * zero-length segments (without overrunning the iovec).
+ */
+ while (bytes || unlikely(i->count && !iov->iov_len)) {
+ int copy;
+
+ copy = min(bytes, iov->iov_len - base);
+ BUG_ON(!i->count || i->count < copy);
+ i->count -= copy;
+ bytes -= copy;
+ base += copy;
+ if (iov->iov_len == base) {
+ iov++;
+ nr_segs--;
+ base = 0;
+ }
+ }
+ i->iov = iov;
+ i->iov_offset = base;
+ i->nr_segs = nr_segs;
+ }
+}
+EXPORT_SYMBOL(iov_iter_advance);
+
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (ie. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+ char __user *buf = i->iov->iov_base + i->iov_offset;
+ bytes = min(bytes, i->iov->iov_len - i->iov_offset);
+ return fault_in_pages_readable(buf, bytes);
+}
+EXPORT_SYMBOL(iov_iter_fault_in_readable);
+
+/*
+ * Return the count of just the current iov_iter segment.
+ */
+size_t iov_iter_single_seg_count(const struct iov_iter *i)
+{
+ const struct iovec *iov = i->iov;
+ if (i->nr_segs == 1)
+ return i->count;
+ else
+ return min(i->count, iov->iov_len - i->iov_offset);
+}
+EXPORT_SYMBOL(iov_iter_single_seg_count);
diff --git a/mm/memblock.c b/mm/memblock.c
index 7fe5354e7552..e9d6ca9a01a9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1253,7 +1253,7 @@ phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
pages += end_pfn - start_pfn;
}
- return (phys_addr_t)pages << PAGE_SHIFT;
+ return PFN_PHYS(pages);
}
/* lowest address */
@@ -1271,16 +1271,14 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void)
void __init memblock_enforce_memory_limit(phys_addr_t limit)
{
- unsigned long i;
phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
+ struct memblock_region *r;
if (!limit)
return;
/* find out max address */
- for (i = 0; i < memblock.memory.cnt; i++) {
- struct memblock_region *r = &memblock.memory.regions[i];
-
+ for_each_memblock(memory, r) {
if (limit <= r->size) {
max_addr = r->base + limit;
break;
@@ -1326,7 +1324,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
unsigned long *start_pfn, unsigned long *end_pfn)
{
struct memblock_type *type = &memblock.memory;
- int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT);
+ int mid = memblock_search(type, PFN_PHYS(pfn));
if (mid == -1)
return -1;
@@ -1379,13 +1377,12 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si
void __init_memblock memblock_trim_memory(phys_addr_t align)
{
- int i;
phys_addr_t start, end, orig_start, orig_end;
- struct memblock_type *mem = &memblock.memory;
+ struct memblock_region *r;
- for (i = 0; i < mem->cnt; i++) {
- orig_start = mem->regions[i].base;
- orig_end = mem->regions[i].base + mem->regions[i].size;
+ for_each_memblock(memory, r) {
+ orig_start = r->base;
+ orig_end = r->base + r->size;
start = round_up(orig_start, align);
end = round_down(orig_end, align);
@@ -1393,11 +1390,12 @@ void __init_memblock memblock_trim_memory(phys_addr_t align)
continue;
if (start < end) {
- mem->regions[i].base = start;
- mem->regions[i].size = end - start;
+ r->base = start;
+ r->size = end - start;
} else {
- memblock_remove_region(mem, i);
- i--;
+ memblock_remove_region(&memblock.memory,
+ r - memblock.memory.regions);
+ r--;
}
}
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dcc8153a1681..29501f040568 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -921,8 +921,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
struct page *page,
bool anon, int nr_pages)
{
- preempt_disable();
-
/*
* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
* counted as CACHE even if it's on ANON LRU.
@@ -947,8 +945,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
}
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
-
- preempt_enable();
}
unsigned long
@@ -1075,22 +1071,15 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
-struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
struct mem_cgroup *memcg = NULL;
- if (!mm)
- return NULL;
- /*
- * Because we have no locks, mm->owner's may be being moved to other
- * cgroup. We use css_tryget() here even if this looks
- * pessimistic (rather than adding locks here).
- */
rcu_read_lock();
do {
memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
if (unlikely(!memcg))
- break;
+ memcg = root_mem_cgroup;
} while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
@@ -1486,7 +1475,7 @@ bool task_in_mem_cgroup(struct task_struct *task,
p = find_lock_task_mm(task);
if (p) {
- curr = try_get_mem_cgroup_from_mm(p->mm);
+ curr = get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
} else {
/*
@@ -1500,8 +1489,6 @@ bool task_in_mem_cgroup(struct task_struct *task,
css_get(&curr->css);
rcu_read_unlock();
}
- if (!curr)
- return false;
/*
* We should check use_hierarchy of "memcg" not "curr". Because checking
* use_hierarchy of "curr" here make this function true if hierarchy is
@@ -2588,7 +2575,7 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
}
-/* See __mem_cgroup_try_charge() for details */
+/* See mem_cgroup_try_charge() for details */
enum {
CHARGE_OK, /* success */
CHARGE_RETRY, /* need to retry but retry is not bad */
@@ -2661,45 +2648,34 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
return CHARGE_NOMEM;
}
-/*
- * __mem_cgroup_try_charge() does
- * 1. detect memcg to be charged against from passed *mm and *ptr,
- * 2. update res_counter
- * 3. call memory reclaim if necessary.
- *
- * In some special case, if the task is fatal, fatal_signal_pending() or
- * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
- * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
- * as possible without any hazards. 2: all pages should have a valid
- * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
- * pointer, that is treated as a charge to root_mem_cgroup.
- *
- * So __mem_cgroup_try_charge() will return
- * 0 ... on success, filling *ptr with a valid memcg pointer.
- * -ENOMEM ... charge failure because of resource limits.
- * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup.
+/**
+ * mem_cgroup_try_charge - try charging a memcg
+ * @memcg: memcg to charge
+ * @nr_pages: number of pages to charge
+ * @oom: trigger OOM if reclaim fails
*
- * Unlike the exported interface, an "oom" parameter is added. if oom==true,
- * the oom-killer can be invoked.
+ * Returns 0 if @memcg was charged successfully, -EINTR if the charge
+ * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
*/
-static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask,
- unsigned int nr_pages,
- struct mem_cgroup **ptr,
- bool oom)
+static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
+ gfp_t gfp_mask,
+ unsigned int nr_pages,
+ bool oom)
{
unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct mem_cgroup *memcg = NULL;
int ret;
+ if (mem_cgroup_is_root(memcg))
+ goto done;
/*
- * Unlike gloval-vm's OOM-kill, we're not in memory shortage
- * in system level. So, allow to go ahead dying process in addition to
- * MEMDIE process.
+ * Unlike in global OOM situations, memcg is not in a physical
+ * memory shortage. Allow dying and OOM-killed tasks to
+ * bypass the last charges so that they can exit quickly and
+ * free their memory.
*/
- if (unlikely(test_thread_flag(TIF_MEMDIE)
- || fatal_signal_pending(current)))
+ if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+ fatal_signal_pending(current)))
goto bypass;
if (unlikely(task_in_memcg_oom(current)))
@@ -2707,73 +2683,16 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
if (gfp_mask & __GFP_NOFAIL)
oom = false;
-
- /*
- * We always charge the cgroup the mm_struct belongs to.
- * The mm_struct's mem_cgroup changes on task migration if the
- * thread group leader migrates. It's possible that mm is not
- * set, if so charge the root memcg (happens for pagecache usage).
- */
- if (!*ptr && !mm)
- *ptr = root_mem_cgroup;
again:
- if (*ptr) { /* css should be a valid one */
- memcg = *ptr;
- if (mem_cgroup_is_root(memcg))
- goto done;
- if (consume_stock(memcg, nr_pages))
- goto done;
- css_get(&memcg->css);
- } else {
- struct task_struct *p;
-
- rcu_read_lock();
- p = rcu_dereference(mm->owner);
- /*
- * Because we don't have task_lock(), "p" can exit.
- * In that case, "memcg" can point to root or p can be NULL with
- * race with swapoff. Then, we have small risk of mis-accouning.
- * But such kind of mis-account by race always happens because
- * we don't have cgroup_mutex(). It's overkill and we allo that
- * small race, here.
- * (*) swapoff at el will charge against mm-struct not against
- * task-struct. So, mm->owner can be NULL.
- */
- memcg = mem_cgroup_from_task(p);
- if (!memcg)
- memcg = root_mem_cgroup;
- if (mem_cgroup_is_root(memcg)) {
- rcu_read_unlock();
- goto done;
- }
- if (consume_stock(memcg, nr_pages)) {
- /*
- * It seems dagerous to access memcg without css_get().
- * But considering how consume_stok works, it's not
- * necessary. If consume_stock success, some charges
- * from this memcg are cached on this cpu. So, we
- * don't need to call css_get()/css_tryget() before
- * calling consume_stock().
- */
- rcu_read_unlock();
- goto done;
- }
- /* after here, we may be blocked. we need to get refcnt */
- if (!css_tryget(&memcg->css)) {
- rcu_read_unlock();
- goto again;
- }
- rcu_read_unlock();
- }
+ if (consume_stock(memcg, nr_pages))
+ goto done;
do {
bool invoke_oom = oom && !nr_oom_retries;
/* If killed, bypass charge */
- if (fatal_signal_pending(current)) {
- css_put(&memcg->css);
+ if (fatal_signal_pending(current))
goto bypass;
- }
ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
nr_pages, invoke_oom);
@@ -2782,17 +2701,12 @@ again:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
batch = nr_pages;
- css_put(&memcg->css);
- memcg = NULL;
goto again;
case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
- css_put(&memcg->css);
goto nomem;
case CHARGE_NOMEM: /* OOM routine works */
- if (!oom || invoke_oom) {
- css_put(&memcg->css);
+ if (!oom || invoke_oom)
goto nomem;
- }
nr_oom_retries--;
break;
}
@@ -2800,20 +2714,44 @@ again:
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
- css_put(&memcg->css);
done:
- *ptr = memcg;
return 0;
nomem:
- if (!(gfp_mask & __GFP_NOFAIL)) {
- *ptr = NULL;
+ if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM;
- }
bypass:
- *ptr = root_mem_cgroup;
return -EINTR;
}
+/**
+ * mem_cgroup_try_charge_mm - try charging a mm
+ * @mm: mm_struct to charge
+ * @nr_pages: number of pages to charge
+ * @oom: trigger OOM if reclaim fails
+ *
+ * Returns the charged mem_cgroup associated with the given mm_struct or
+ * NULL if the charge failed.
+ */
+static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
+ gfp_t gfp_mask,
+ unsigned int nr_pages,
+ bool oom)
+
+{
+ struct mem_cgroup *memcg;
+ int ret;
+
+ memcg = get_mem_cgroup_from_mm(mm);
+ ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
+ css_put(&memcg->css);
+ if (ret == -EINTR)
+ memcg = root_mem_cgroup;
+ else if (ret)
+ memcg = NULL;
+
+ return memcg;
+}
+
/*
* Somemtimes we have to undo a charge we got by try_charge().
* This function is for that and do uncharge, put css's refcnt.
@@ -3009,20 +2947,17 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
{
struct res_counter *fail_res;
- struct mem_cgroup *_memcg;
int ret = 0;
ret = res_counter_charge(&memcg->kmem, size, &fail_res);
if (ret)
return ret;
- _memcg = memcg;
- ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
- &_memcg, oom_gfp_allowed(gfp));
-
+ ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
+ oom_gfp_allowed(gfp));
if (ret == -EINTR) {
/*
- * __mem_cgroup_try_charge() chosed to bypass to root due to
+ * mem_cgroup_try_charge() chose to bypass to root due to
* OOM kill or fatal signal. Since our only options are to
* either fail the allocation or charge it to this cgroup, do
* it as a temporary condition. But we can't fail. From a
@@ -3032,7 +2967,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
*
* This condition will only trigger if the task entered
* memcg_charge_kmem in a sane state, but was OOM-killed during
- * __mem_cgroup_try_charge() above. Tasks that were already
+ * mem_cgroup_try_charge() above. Tasks that were already
* dying when the allocation triggers should have been already
* directed to the root cgroup in memcontrol.h
*/
@@ -3159,6 +3094,29 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
return 0;
}
+char *memcg_create_cache_name(struct mem_cgroup *memcg,
+ struct kmem_cache *root_cache)
+{
+ static char *buf = NULL;
+
+ /*
+ * We need a mutex here to protect the shared buffer. Since this is
+ * expected to be called only on cache creation, we can employ the
+ * slab_mutex for that purpose.
+ */
+ lockdep_assert_held(&slab_mutex);
+
+ if (!buf) {
+ buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!buf)
+ return NULL;
+ }
+
+ cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1);
+ return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
+ memcg_cache_id(memcg), buf);
+}
+
int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
struct kmem_cache *root_cache)
{
@@ -3182,6 +3140,7 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
s->memcg_params->root_cache = root_cache;
INIT_WORK(&s->memcg_params->destroy,
kmem_cache_destroy_work_func);
+ css_get(&memcg->css);
} else
s->memcg_params->is_root_cache = true;
@@ -3190,6 +3149,10 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
void memcg_free_cache_params(struct kmem_cache *s)
{
+ if (!s->memcg_params)
+ return;
+ if (!s->memcg_params->is_root_cache)
+ css_put(&s->memcg_params->memcg->css);
kfree(s->memcg_params);
}
@@ -3212,9 +3175,6 @@ void memcg_register_cache(struct kmem_cache *s)
memcg = s->memcg_params->memcg;
id = memcg_cache_id(memcg);
- css_get(&memcg->css);
-
-
/*
* Since readers won't lock (see cache_from_memcg_idx()), we need a
* barrier here to ensure nobody will see the kmem_cache partially
@@ -3263,10 +3223,8 @@ void memcg_unregister_cache(struct kmem_cache *s)
* after removing it from the memcg_slab_caches list, otherwise we can
* fail to convert memcg_params_to_cache() while traversing the list.
*/
- VM_BUG_ON(!root->memcg_params->memcg_caches[id]);
+ VM_BUG_ON(root->memcg_params->memcg_caches[id] != s);
root->memcg_params->memcg_caches[id] = NULL;
-
- css_put(&memcg->css);
}
/*
@@ -3363,55 +3321,10 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
schedule_work(&cachep->memcg_params->destroy);
}
-static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *s)
-{
- struct kmem_cache *new = NULL;
- static char *tmp_path = NULL, *tmp_name = NULL;
- static DEFINE_MUTEX(mutex); /* protects tmp_name */
-
- BUG_ON(!memcg_can_account_kmem(memcg));
-
- mutex_lock(&mutex);
- /*
- * kmem_cache_create_memcg duplicates the given name and
- * cgroup_name for this name requires RCU context.
- * This static temporary buffer is used to prevent from
- * pointless shortliving allocation.
- */
- if (!tmp_path || !tmp_name) {
- if (!tmp_path)
- tmp_path = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!tmp_name)
- tmp_name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
- if (!tmp_path || !tmp_name)
- goto out;
- }
-
- cgroup_name(memcg->css.cgroup, tmp_name, NAME_MAX + 1);
- snprintf(tmp_path, PATH_MAX, "%s(%d:%s)", s->name,
- memcg_cache_id(memcg), tmp_name);
-
- new = kmem_cache_create_memcg(memcg, tmp_path, s->object_size, s->align,
- (s->flags & ~SLAB_PANIC), s->ctor, s);
- if (new)
- new->allocflags |= __GFP_KMEMCG;
- else
- new = s;
-out:
- mutex_unlock(&mutex);
- return new;
-}
-
-void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
+int __kmem_cache_destroy_memcg_children(struct kmem_cache *s)
{
struct kmem_cache *c;
- int i;
-
- if (!s->memcg_params)
- return;
- if (!s->memcg_params->is_root_cache)
- return;
+ int i, failed = 0;
/*
* If the cache is being destroyed, we trust that there is no one else
@@ -3445,16 +3358,14 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
c->memcg_params->dead = false;
cancel_work_sync(&c->memcg_params->destroy);
kmem_cache_destroy(c);
+
+ if (cache_from_memcg_idx(s, i))
+ failed++;
}
mutex_unlock(&activate_kmem_mutex);
+ return failed;
}
-struct create_work {
- struct mem_cgroup *memcg;
- struct kmem_cache *cachep;
- struct work_struct work;
-};
-
static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
{
struct kmem_cache *cachep;
@@ -3472,13 +3383,20 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
mutex_unlock(&memcg->slab_caches_mutex);
}
+struct create_work {
+ struct mem_cgroup *memcg;
+ struct kmem_cache *cachep;
+ struct work_struct work;
+};
+
static void memcg_create_cache_work_func(struct work_struct *w)
{
- struct create_work *cw;
+ struct create_work *cw = container_of(w, struct create_work, work);
+ struct mem_cgroup *memcg = cw->memcg;
+ struct kmem_cache *cachep = cw->cachep;
- cw = container_of(w, struct create_work, work);
- memcg_create_kmem_cache(cw->memcg, cw->cachep);
- css_put(&cw->memcg->css);
+ kmem_cache_create_memcg(memcg, cachep);
+ css_put(&memcg->css);
kfree(cw);
}
@@ -3637,15 +3555,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
if (!current->mm || current->memcg_kmem_skip_account)
return true;
- memcg = try_get_mem_cgroup_from_mm(current->mm);
-
- /*
- * very rare case described in mem_cgroup_from_task. Unfortunately there
- * isn't much we can do without complicating this too much, and it would
- * be gfp-dependent anyway. Just let it go
- */
- if (unlikely(!memcg))
- return true;
+ memcg = get_mem_cgroup_from_mm(current->mm);
if (!memcg_can_account_kmem(memcg)) {
css_put(&memcg->css);
@@ -3748,19 +3658,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static inline
-void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
- struct mem_cgroup *to,
- unsigned int nr_pages,
- enum mem_cgroup_stat_index idx)
-{
- /* Update stat data for mem_cgroup */
- preempt_disable();
- __this_cpu_sub(from->stat->count[idx], nr_pages);
- __this_cpu_add(to->stat->count[idx], nr_pages);
- preempt_enable();
-}
-
/**
* mem_cgroup_move_account - move account of the page
* @page: the page
@@ -3806,13 +3703,19 @@ static int mem_cgroup_move_account(struct page *page,
move_lock_mem_cgroup(from, &flags);
- if (!anon && page_mapped(page))
- mem_cgroup_move_account_page_stat(from, to, nr_pages,
- MEM_CGROUP_STAT_FILE_MAPPED);
+ if (!anon && page_mapped(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ }
- if (PageWriteback(page))
- mem_cgroup_move_account_page_stat(from, to, nr_pages,
- MEM_CGROUP_STAT_WRITEBACK);
+ if (PageWriteback(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ }
mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
@@ -3898,19 +3801,19 @@ out:
return ret;
}
-/*
- * Charge the memory controller for page usage.
- * Return
- * 0 if the charge was successful
- * < 0 if the cgroup is over its limit
- */
-static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, enum charge_type ctype)
+int mem_cgroup_charge_anon(struct page *page,
+ struct mm_struct *mm, gfp_t gfp_mask)
{
- struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
+ struct mem_cgroup *memcg;
bool oom = true;
- int ret;
+
+ if (mem_cgroup_disabled())
+ return 0;
+
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
+ VM_BUG_ON(!mm);
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
@@ -3922,25 +3825,14 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
oom = false;
}
- ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
- if (ret == -ENOMEM)
- return ret;
- __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
+ if (!memcg)
+ return -ENOMEM;
+ __mem_cgroup_commit_charge(memcg, page, nr_pages,
+ MEM_CGROUP_CHARGE_TYPE_ANON, false);
return 0;
}
-int mem_cgroup_newpage_charge(struct page *page,
- struct mm_struct *mm, gfp_t gfp_mask)
-{
- if (mem_cgroup_disabled())
- return 0;
- VM_BUG_ON_PAGE(page_mapped(page), page);
- VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
- VM_BUG_ON(!mm);
- return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_ANON);
-}
-
/*
* While swap-in, try_charge -> commit or cancel, the page is locked.
* And when try_charge() successfully returns, one refcnt to memcg without
@@ -3952,7 +3844,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
gfp_t mask,
struct mem_cgroup **memcgp)
{
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg = NULL;
struct page_cgroup *pc;
int ret;
@@ -3965,31 +3857,29 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
* in turn serializes uncharging.
*/
if (PageCgroupUsed(pc))
- return 0;
- if (!do_swap_account)
- goto charge_cur_mm;
- memcg = try_get_mem_cgroup_from_page(page);
+ goto out;
+ if (do_swap_account)
+ memcg = try_get_mem_cgroup_from_page(page);
if (!memcg)
- goto charge_cur_mm;
- *memcgp = memcg;
- ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
+ memcg = get_mem_cgroup_from_mm(mm);
+ ret = mem_cgroup_try_charge(memcg, mask, 1, true);
css_put(&memcg->css);
if (ret == -EINTR)
- ret = 0;
- return ret;
-charge_cur_mm:
- ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
- if (ret == -EINTR)
- ret = 0;
- return ret;
+ memcg = root_mem_cgroup;
+ else if (ret)
+ return ret;
+out:
+ *memcgp = memcg;
+ return 0;
}
int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
gfp_t gfp_mask, struct mem_cgroup **memcgp)
{
- *memcgp = NULL;
- if (mem_cgroup_disabled())
+ if (mem_cgroup_disabled()) {
+ *memcgp = NULL;
return 0;
+ }
/*
* A racing thread's fault, or swapoff, may have already
* updated the pte, and even removed page from swap cache: in
@@ -3997,12 +3887,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
* there's also a KSM case which does need to charge the page.
*/
if (!PageSwapCache(page)) {
- int ret;
+ struct mem_cgroup *memcg;
- ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
- if (ret == -EINTR)
- ret = 0;
- return ret;
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
+ if (!memcg)
+ return -ENOMEM;
+ *memcgp = memcg;
+ return 0;
}
return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
}
@@ -4046,11 +3937,11 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
MEM_CGROUP_CHARGE_TYPE_ANON);
}
-int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
+int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
- struct mem_cgroup *memcg = NULL;
enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
+ struct mem_cgroup *memcg;
int ret;
if (mem_cgroup_disabled())
@@ -4058,15 +3949,28 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
if (PageCompound(page))
return 0;
- if (!PageSwapCache(page))
- ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
- else { /* page is swapcache/shmem */
+ if (PageSwapCache(page)) { /* shmem */
ret = __mem_cgroup_try_charge_swapin(mm, page,
gfp_mask, &memcg);
- if (!ret)
- __mem_cgroup_commit_charge_swapin(page, memcg, type);
+ if (ret)
+ return ret;
+ __mem_cgroup_commit_charge_swapin(page, memcg, type);
+ return 0;
}
- return ret;
+
+ /*
+ * Page cache insertions can happen without an actual mm
+ * context, e.g. during disk probing on boot.
+ */
+ if (unlikely(!mm))
+ memcg = root_mem_cgroup;
+ else {
+ memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
+ if (!memcg)
+ return -ENOMEM;
+ }
+ __mem_cgroup_commit_charge(memcg, page, 1, type, false);
+ return 0;
}
static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -6678,8 +6582,7 @@ one_by_one:
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
- ret = __mem_cgroup_try_charge(NULL,
- GFP_KERNEL, 1, &memcg, false);
+ ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
if (ret)
/* mem_cgroup_clear_mc() will do uncharge later */
return ret;
diff --git a/mm/memory.c b/mm/memory.c
index 82c1e4cf00d1..d0f0bef3be48 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -60,6 +60,7 @@
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/dma-debug.h>
+#include <linux/debugfs.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -1320,9 +1321,9 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* It is undesirable to test vma->vm_file as it
* should be non-null for valid hugetlb area.
* However, vm_file will be NULL in the error
- * cleanup path of do_mmap_pgoff. When
+ * cleanup path of mmap_region. When
* hugetlbfs ->mmap method fails,
- * do_mmap_pgoff() nullifies vma->vm_file
+ * mmap_region() nullifies vma->vm_file
* before calling this function to clean up.
* Since no pte has actually been setup, it is
* safe to do nothing in this case.
@@ -2781,7 +2782,7 @@ reuse:
*/
if (!page_mkwrite) {
wait_on_page_locked(dirty_page);
- set_page_dirty_balance(dirty_page, page_mkwrite);
+ set_page_dirty_balance(dirty_page);
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
@@ -2827,7 +2828,7 @@ gotten:
}
__SetPageUptodate(new_page);
- if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
+ if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
goto oom_free_new;
mmun_start = address & PAGE_MASK;
@@ -3280,7 +3281,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
__SetPageUptodate(page);
- if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
+ if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL))
goto oom_free_page;
entry = mk_pte(page, vma->vm_page_prot);
@@ -3342,7 +3343,22 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
return ret;
}
-static void do_set_pte(struct vm_area_struct *vma, unsigned long address,
+/**
+ * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
+ *
+ * @vma: virtual memory area
+ * @address: user virtual address
+ * @page: page to map
+ * @pte: pointer to target page table entry
+ * @write: true, if new entry is writable
+ * @anon: true, if it's anonymous page
+ *
+ * Caller must hold page table lock relevant for @pte.
+ *
+ * Target users are the page fault handler itself and implementations of
+ * vm_ops->map_pages.
+ */
+void do_set_pte(struct vm_area_struct *vma, unsigned long address,
struct page *page, pte_t *pte, bool write, bool anon)
{
pte_t entry;
@@ -3366,6 +3382,105 @@ static void do_set_pte(struct vm_area_struct *vma, unsigned long address,
update_mmu_cache(vma, address, pte);
}
+#define FAULT_AROUND_ORDER 4
+
+#ifdef CONFIG_DEBUG_FS
+static unsigned int fault_around_order = FAULT_AROUND_ORDER;
+
+static int fault_around_order_get(void *data, u64 *val)
+{
+ *val = fault_around_order;
+ return 0;
+}
+
+static int fault_around_order_set(void *data, u64 val)
+{
+ BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE);
+ if (1UL << val > PTRS_PER_PTE)
+ return -EINVAL;
+ fault_around_order = val;
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops,
+ fault_around_order_get, fault_around_order_set, "%llu\n");
+
+static int __init fault_around_debugfs(void)
+{
+ void *ret;
+
+ ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL,
+ &fault_around_order_fops);
+ if (!ret)
+ pr_warn("Failed to create fault_around_order in debugfs");
+ return 0;
+}
+late_initcall(fault_around_debugfs);
+
+static inline unsigned long fault_around_pages(void)
+{
+ return 1UL << fault_around_order;
+}
+
+static inline unsigned long fault_around_mask(void)
+{
+ return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1);
+}
+#else
+static inline unsigned long fault_around_pages(void)
+{
+ unsigned long nr_pages;
+
+ nr_pages = 1UL << FAULT_AROUND_ORDER;
+ BUILD_BUG_ON(nr_pages > PTRS_PER_PTE);
+ return nr_pages;
+}
+
+static inline unsigned long fault_around_mask(void)
+{
+ return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1);
+}
+#endif
+
+static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pgoff_t pgoff, unsigned int flags)
+{
+ unsigned long start_addr;
+ pgoff_t max_pgoff;
+ struct vm_fault vmf;
+ int off;
+
+ start_addr = max(address & fault_around_mask(), vma->vm_start);
+ off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
+ pte -= off;
+ pgoff -= off;
+
+ /*
+ * max_pgoff is either end of page table or end of vma
+ * or fault_around_pages() from pgoff, depending on what is nearest.
+ */
+ max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+ PTRS_PER_PTE - 1;
+ max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1,
+ pgoff + fault_around_pages() - 1);
+
+ /* Check if it makes any sense to call ->map_pages */
+ while (!pte_none(*pte)) {
+ if (++pgoff > max_pgoff)
+ return;
+ start_addr += PAGE_SIZE;
+ if (start_addr >= vma->vm_end)
+ return;
+ pte++;
+ }
+
+ vmf.virtual_address = (void __user *) start_addr;
+ vmf.pte = pte;
+ vmf.pgoff = pgoff;
+ vmf.max_pgoff = max_pgoff;
+ vmf.flags = flags;
+ vma->vm_ops->map_pages(vma, &vmf);
+}
+
static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
@@ -3373,7 +3488,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *fault_page;
spinlock_t *ptl;
pte_t *pte;
- int ret;
+ int ret = 0;
+
+ /*
+ * Let's call ->map_pages() first and use ->fault() as fallback
+ * if the page at that offset is not ready to be mapped (cold cache
+ * or something).
+ */
+ if (vma->vm_ops->map_pages) {
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ do_fault_around(vma, address, pte, pgoff, flags);
+ if (!pte_same(*pte, orig_pte))
+ goto unlock_out;
+ pte_unmap_unlock(pte, ptl);
+ }
ret = __do_fault(vma, address, pgoff, flags, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -3387,8 +3515,9 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return ret;
}
do_set_pte(vma, address, fault_page, pte, false, false);
- pte_unmap_unlock(pte, ptl);
unlock_page(fault_page);
+unlock_out:
+ pte_unmap_unlock(pte, ptl);
return ret;
}
@@ -3408,7 +3537,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!new_page)
return VM_FAULT_OOM;
- if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) {
+ if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) {
page_cache_release(new_page);
return VM_FAULT_OOM;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e3ab02822799..78e1472933ea 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -795,36 +795,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
return err;
}
-/*
- * Update task->flags PF_MEMPOLICY bit: set iff non-default
- * mempolicy. Allows more rapid checking of this (combined perhaps
- * with other PF_* flag bits) on memory allocation hot code paths.
- *
- * If called from outside this file, the task 'p' should -only- be
- * a newly forked child not yet visible on the task list, because
- * manipulating the task flags of a visible task is not safe.
- *
- * The above limitation is why this routine has the funny name
- * mpol_fix_fork_child_flag().
- *
- * It is also safe to call this with a task pointer of current,
- * which the static wrapper mpol_set_task_struct_flag() does,
- * for use within this file.
- */
-
-void mpol_fix_fork_child_flag(struct task_struct *p)
-{
- if (p->mempolicy)
- p->flags |= PF_MEMPOLICY;
- else
- p->flags &= ~PF_MEMPOLICY;
-}
-
-static void mpol_set_task_struct_flag(void)
-{
- mpol_fix_fork_child_flag(current);
-}
-
/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
@@ -861,7 +831,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
}
old = current->mempolicy;
current->mempolicy = new;
- mpol_set_task_struct_flag();
if (new && new->mode == MPOL_INTERLEAVE &&
nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
@@ -1782,21 +1751,18 @@ static unsigned interleave_nodes(struct mempolicy *policy)
/*
* Depending on the memory policy provide a node from which to allocate the
* next slab entry.
- * @policy must be protected by freeing by the caller. If @policy is
- * the current task's mempolicy, this protection is implicit, as only the
- * task can change it's policy. The system default policy requires no
- * such protection.
*/
-unsigned slab_node(void)
+unsigned int mempolicy_slab_node(void)
{
struct mempolicy *policy;
+ int node = numa_mem_id();
if (in_interrupt())
- return numa_node_id();
+ return node;
policy = current->mempolicy;
if (!policy || policy->flags & MPOL_F_LOCAL)
- return numa_node_id();
+ return node;
switch (policy->mode) {
case MPOL_PREFERRED:
@@ -1816,11 +1782,11 @@ unsigned slab_node(void)
struct zonelist *zonelist;
struct zone *zone;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
- zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
+ zonelist = &NODE_DATA(node)->node_zonelists[0];
(void)first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes,
&zone);
- return zone ? zone->node : numa_node_id();
+ return zone ? zone->node : node;
}
default:
diff --git a/mm/mempool.c b/mm/mempool.c
index 659aa42bad16..905434f18c97 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -304,9 +304,9 @@ void mempool_free(void *element, mempool_t *pool)
* ensures that there will be frees which return elements to the
* pool waking up the waiters.
*/
- if (pool->curr_nr < pool->min_nr) {
+ if (unlikely(pool->curr_nr < pool->min_nr)) {
spin_lock_irqsave(&pool->lock, flags);
- if (pool->curr_nr < pool->min_nr) {
+ if (likely(pool->curr_nr < pool->min_nr)) {
add_element(pool, element);
spin_unlock_irqrestore(&pool->lock, flags);
wake_up(&pool->wait);
diff --git a/mm/mlock.c b/mm/mlock.c
index 4e1a68162285..b1eb53634005 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -79,6 +79,7 @@ void clear_page_mlock(struct page *page)
*/
void mlock_vma_page(struct page *page)
{
+ /* Serialize with page migration */
BUG_ON(!PageLocked(page));
if (!TestSetPageMlocked(page)) {
@@ -174,6 +175,7 @@ unsigned int munlock_vma_page(struct page *page)
unsigned int nr_pages;
struct zone *zone = page_zone(page);
+ /* For try_to_munlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
/*
diff --git a/mm/mmap.c b/mm/mmap.c
index 46433e137abc..b1202cf81f4b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -10,6 +10,7 @@
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
@@ -681,8 +682,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
prev->vm_next = next = vma->vm_next;
if (next)
next->vm_prev = prev;
- if (mm->mmap_cache == vma)
- mm->mmap_cache = prev;
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
@@ -1989,34 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct *vma = NULL;
+ struct rb_node *rb_node;
+ struct vm_area_struct *vma;
/* Check the cache first. */
- /* (Cache hit rate is typically around 35%.) */
- vma = ACCESS_ONCE(mm->mmap_cache);
- if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
- struct rb_node *rb_node;
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
+ return vma;
- rb_node = mm->mm_rb.rb_node;
- vma = NULL;
+ rb_node = mm->mm_rb.rb_node;
+ vma = NULL;
- while (rb_node) {
- struct vm_area_struct *vma_tmp;
-
- vma_tmp = rb_entry(rb_node,
- struct vm_area_struct, vm_rb);
-
- if (vma_tmp->vm_end > addr) {
- vma = vma_tmp;
- if (vma_tmp->vm_start <= addr)
- break;
- rb_node = rb_node->rb_left;
- } else
- rb_node = rb_node->rb_right;
- }
- if (vma)
- mm->mmap_cache = vma;
+ while (rb_node) {
+ struct vm_area_struct *tmp;
+
+ tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
+
+ if (tmp->vm_end > addr) {
+ vma = tmp;
+ if (tmp->vm_start <= addr)
+ break;
+ rb_node = rb_node->rb_left;
+ } else
+ rb_node = rb_node->rb_right;
}
+
+ if (vma)
+ vmacache_update(addr, vma);
return vma;
}
@@ -2388,7 +2389,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
} else
mm->highest_vm_end = prev ? prev->vm_end : 0;
tail_vma->vm_next = NULL;
- mm->mmap_cache = NULL; /* Kill the cache. */
+
+ /* Kill the cache */
+ vmacache_invalidate(mm);
}
/*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 769a67a15803..c43d557941f8 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -36,6 +36,34 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
}
#endif
+/*
+ * For a prot_numa update we only hold mmap_sem for read so there is a
+ * potential race with faulting where a pmd was temporarily none. This
+ * function checks for a transhuge pmd under the appropriate lock. It
+ * returns a pte if it was successfully locked or NULL if it raced with
+ * a transhuge insertion.
+ */
+static pte_t *lock_pte_protection(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, int prot_numa, spinlock_t **ptl)
+{
+ pte_t *pte;
+ spinlock_t *pmdl;
+
+ /* !prot_numa is protected by mmap_sem held for write */
+ if (!prot_numa)
+ return pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
+
+ pmdl = pmd_lock(vma->vm_mm, pmd);
+ if (unlikely(pmd_trans_huge(*pmd) || pmd_none(*pmd))) {
+ spin_unlock(pmdl);
+ return NULL;
+ }
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, ptl);
+ spin_unlock(pmdl);
+ return pte;
+}
+
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
@@ -45,7 +73,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
spinlock_t *ptl;
unsigned long pages = 0;
- pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
+ if (!pte)
+ return 0;
+
arch_enter_lazy_mmu_mode();
do {
oldpte = *pte;
@@ -109,15 +140,26 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pmd_t *pmd;
+ struct mm_struct *mm = vma->vm_mm;
unsigned long next;
unsigned long pages = 0;
unsigned long nr_huge_updates = 0;
+ unsigned long mni_start = 0;
pmd = pmd_offset(pud, addr);
do {
unsigned long this_pages;
next = pmd_addr_end(addr, end);
+ if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
+ continue;
+
+ /* invoke the mmu notifier if the pmd is populated */
+ if (!mni_start) {
+ mni_start = addr;
+ mmu_notifier_invalidate_range_start(mm, mni_start, end);
+ }
+
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
split_huge_page_pmd(vma, addr, pmd);
@@ -130,18 +172,21 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pages += HPAGE_PMD_NR;
nr_huge_updates++;
}
+
+ /* huge pmd was handled */
continue;
}
}
- /* fall through */
+ /* fall through, the trans huge pmd just split */
}
- if (pmd_none_or_clear_bad(pmd))
- continue;
this_pages = change_pte_range(vma, pmd, addr, next, newprot,
dirty_accountable, prot_numa);
pages += this_pages;
} while (pmd++, addr = next, addr != end);
+ if (mni_start)
+ mmu_notifier_invalidate_range_end(mm, mni_start, end);
+
if (nr_huge_updates)
count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
return pages;
@@ -201,15 +246,12 @@ unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
{
- struct mm_struct *mm = vma->vm_mm;
unsigned long pages;
- mmu_notifier_invalidate_range_start(mm, start, end);
if (is_vm_hugetlb_page(vma))
pages = hugetlb_change_protection(vma, start, end, newprot);
else
pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
- mmu_notifier_invalidate_range_end(mm, start, end);
return pages;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index a554e5a451cd..85f8d6698d48 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -15,6 +15,7 @@
#include <linux/export.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/file.h>
@@ -24,6 +25,7 @@
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/compiler.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/security.h>
@@ -296,7 +298,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
count = -(unsigned long) addr;
memcpy(addr, buf, count);
- return(count);
+ return count;
}
/*
@@ -459,7 +461,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases);
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
* have one.
*/
-void __attribute__((weak)) vmalloc_sync_all(void)
+void __weak vmalloc_sync_all(void)
{
}
@@ -768,16 +770,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
*/
static void delete_vma_from_mm(struct vm_area_struct *vma)
{
+ int i;
struct address_space *mapping;
struct mm_struct *mm = vma->vm_mm;
+ struct task_struct *curr = current;
kenter("%p", vma);
protect_vma(vma, 0);
mm->map_count--;
- if (mm->mmap_cache == vma)
- mm->mmap_cache = NULL;
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ /* if the vma is cached, invalidate the entire cache */
+ if (curr->vmacache[i] == vma) {
+ vmacache_invalidate(curr->mm);
+ break;
+ }
+ }
/* remove the VMA from the mapping */
if (vma->vm_file) {
@@ -825,8 +834,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
struct vm_area_struct *vma;
/* check the cache first */
- vma = ACCESS_ONCE(mm->mmap_cache);
- if (vma && vma->vm_start <= addr && vma->vm_end > addr)
+ vma = vmacache_find(mm, addr);
+ if (likely(vma))
return vma;
/* trawl the list (there may be multiple mappings in which addr
@@ -835,7 +844,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
if (vma->vm_start > addr)
return NULL;
if (vma->vm_end > addr) {
- mm->mmap_cache = vma;
+ vmacache_update(addr, vma);
return vma;
}
}
@@ -874,8 +883,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
unsigned long end = addr + len;
/* check the cache first */
- vma = mm->mmap_cache;
- if (vma && vma->vm_start == addr && vma->vm_end == end)
+ vma = vmacache_find_exact(mm, addr, end);
+ if (vma)
return vma;
/* trawl the list (there may be multiple mappings in which addr
@@ -886,7 +895,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
if (vma->vm_start > addr)
return NULL;
if (vma->vm_end == end) {
- mm->mmap_cache = vma;
+ vmacache_update(addr, vma);
return vma;
}
}
@@ -1003,8 +1012,7 @@ static int validate_mmap_request(struct file *file,
/* we mustn't privatise shared mappings */
capabilities &= ~BDI_CAP_MAP_COPY;
- }
- else {
+ } else {
/* we're going to read the file into private memory we
* allocate */
if (!(capabilities & BDI_CAP_MAP_COPY))
@@ -1035,23 +1043,20 @@ static int validate_mmap_request(struct file *file,
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
if (prot & PROT_EXEC)
return -EPERM;
- }
- else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
+ } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
/* handle implication of PROT_EXEC by PROT_READ */
if (current->personality & READ_IMPLIES_EXEC) {
if (capabilities & BDI_CAP_EXEC_MAP)
prot |= PROT_EXEC;
}
- }
- else if ((prot & PROT_READ) &&
+ } else if ((prot & PROT_READ) &&
(prot & PROT_EXEC) &&
!(capabilities & BDI_CAP_EXEC_MAP)
) {
/* backing file is not executable, try to copy */
capabilities &= ~BDI_CAP_MAP_DIRECT;
}
- }
- else {
+ } else {
/* anonymous mappings are always memory backed and can be
* privately mapped
*/
@@ -1659,7 +1664,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
/* find the first potentially overlapping VMA */
vma = find_vma(mm, start);
if (!vma) {
- static int limit = 0;
+ static int limit;
if (limit < 5) {
printk(KERN_WARNING
"munmap of memory not mmapped by process %d"
@@ -1985,6 +1990,12 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
+void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ BUG();
+}
+EXPORT_SYMBOL(filemap_map_pages);
+
int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
unsigned long size, pgoff_t pgoff)
{
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7106cb1aca8e..ef413492a149 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1562,9 +1562,9 @@ pause:
bdi_start_background_writeback(bdi);
}
-void set_page_dirty_balance(struct page *page, int page_mkwrite)
+void set_page_dirty_balance(struct page *page)
{
- if (set_page_dirty(page) || page_mkwrite) {
+ if (set_page_dirty(page)) {
struct address_space *mapping = page_mapping(page);
if (mapping)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 979378deccbf..5dba2933c9c0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -295,7 +295,8 @@ static inline int bad_range(struct zone *zone, struct page *page)
}
#endif
-static void bad_page(struct page *page, char *reason, unsigned long bad_flags)
+static void bad_page(struct page *page, const char *reason,
+ unsigned long bad_flags)
{
static unsigned long resume;
static unsigned long nr_shown;
@@ -623,7 +624,7 @@ out:
static inline int free_pages_check(struct page *page)
{
- char *bad_reason = NULL;
+ const char *bad_reason = NULL;
unsigned long bad_flags = 0;
if (unlikely(page_mapcount(page)))
@@ -859,7 +860,7 @@ static inline void expand(struct zone *zone, struct page *page,
*/
static inline int check_new_page(struct page *page)
{
- char *bad_reason = NULL;
+ const char *bad_reason = NULL;
unsigned long bad_flags = 0;
if (unlikely(page_mapcount(page)))
@@ -1238,15 +1239,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
}
local_irq_restore(flags);
}
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
- return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
-}
-#else
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
- return false;
-}
#endif
/*
@@ -1583,12 +1575,7 @@ again:
get_pageblock_migratetype(page));
}
- /*
- * NOTE: GFP_THISNODE allocations do not partake in the kswapd
- * aging protocol, so they can't be fair.
- */
- if (!gfp_thisnode_allocation(gfp_flags))
- __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+ __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1870,7 +1857,7 @@ static void __paginginit init_zone_allows_reclaim(int nid)
{
int i;
- for_each_online_node(i)
+ for_each_node_state(i, N_MEMORY)
if (node_distance(nid, i) <= RECLAIM_DISTANCE)
node_set(i, NODE_DATA(nid)->reclaim_nodes);
else
@@ -1954,23 +1941,12 @@ zonelist_scan:
* zone size to ensure fair page aging. The zone a
* page was allocated in should have no effect on the
* time the page has in memory before being reclaimed.
- *
- * Try to stay in local zones in the fastpath. If
- * that fails, the slowpath is entered, which will do
- * another pass starting with the local zones, but
- * ultimately fall back to remote zones that do not
- * partake in the fairness round-robin cycle of this
- * zonelist.
- *
- * NOTE: GFP_THISNODE allocations do not partake in
- * the kswapd aging protocol, so they can't be fair.
*/
- if ((alloc_flags & ALLOC_WMARK_LOW) &&
- !gfp_thisnode_allocation(gfp_mask)) {
- if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
- continue;
+ if (alloc_flags & ALLOC_FAIR) {
if (!zone_local(preferred_zone, zone))
continue;
+ if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+ continue;
}
/*
* When allocating a page cache page for writing, we
@@ -2408,32 +2384,40 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
return page;
}
-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist,
- enum zone_type high_zoneidx,
- struct zone *preferred_zone)
+static void reset_alloc_batches(struct zonelist *zonelist,
+ enum zone_type high_zoneidx,
+ struct zone *preferred_zone)
{
struct zoneref *z;
struct zone *zone;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
- if (!(gfp_mask & __GFP_NO_KSWAPD))
- wakeup_kswapd(zone, order, zone_idx(preferred_zone));
/*
* Only reset the batches of zones that were actually
- * considered in the fast path, we don't want to
- * thrash fairness information for zones that are not
+ * considered in the fairness pass, we don't want to
+ * trash fairness information for zones that are not
* actually part of this zonelist's round-robin cycle.
*/
if (!zone_local(preferred_zone, zone))
continue;
mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) -
- low_wmark_pages(zone) -
- zone_page_state(zone, NR_ALLOC_BATCH));
+ high_wmark_pages(zone) - low_wmark_pages(zone) -
+ atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
}
}
+static void wake_all_kswapds(unsigned int order,
+ struct zonelist *zonelist,
+ enum zone_type high_zoneidx,
+ struct zone *preferred_zone)
+{
+ struct zoneref *z;
+ struct zone *zone;
+
+ for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+ wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+}
+
static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
@@ -2522,12 +2506,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
* allowed per node queues are empty and that nodes are
* over allocated.
*/
- if (gfp_thisnode_allocation(gfp_mask))
+ if (IS_ENABLED(CONFIG_NUMA) &&
+ (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
restart:
- prepare_slowpath(gfp_mask, order, zonelist,
- high_zoneidx, preferred_zone);
+ if (!(gfp_mask & __GFP_NO_KSWAPD))
+ wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2711,7 +2696,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
- int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+ int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
struct mem_cgroup *memcg = NULL;
gfp_mask &= gfp_allowed_mask;
@@ -2752,12 +2737,29 @@ retry_cpuset:
if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
+retry:
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,
preferred_zone, migratetype);
if (unlikely(!page)) {
/*
+ * The first pass makes sure allocations are spread
+ * fairly within the local node. However, the local
+ * node might have free pages left after the fairness
+ * batches are exhausted, and remote zones haven't
+ * even been considered yet. Try once more without
+ * fairness, and include remote zones now, before
+ * entering the slowpath and waking kswapd: prefer
+ * spilling to a remote zone over swapping locally.
+ */
+ if (alloc_flags & ALLOC_FAIR) {
+ reset_alloc_batches(zonelist, high_zoneidx,
+ preferred_zone);
+ alloc_flags &= ~ALLOC_FAIR;
+ goto retry;
+ }
+ /*
* Runtime PM, block IO and its error handling path
* can deadlock because I/O on the device might not
* complete.
@@ -4919,7 +4921,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
- init_zone_allows_reclaim(nid);
+ if (node_state(nid, N_MEMORY))
+ init_zone_allows_reclaim(nid);
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
#endif
@@ -5070,7 +5073,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
int usable_nodes = nodes_weight(node_states[N_MEMORY]);
- struct memblock_type *type = &memblock.memory;
+ struct memblock_region *r;
/* Need to find movable_zone earlier when movable_node is specified. */
find_usable_zone_for_movable();
@@ -5080,13 +5083,13 @@ static void __init find_zone_movable_pfns_for_nodes(void)
* options.
*/
if (movable_node_is_enabled()) {
- for (i = 0; i < type->cnt; i++) {
- if (!memblock_is_hotpluggable(&type->regions[i]))
+ for_each_memblock(memory, r) {
+ if (!memblock_is_hotpluggable(r))
continue;
- nid = type->regions[i].nid;
+ nid = r->nid;
- usable_startpfn = PFN_DOWN(type->regions[i].base);
+ usable_startpfn = PFN_DOWN(r->base);
zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
min(usable_startpfn, zone_movable_pfn[nid]) :
usable_startpfn;
@@ -6544,7 +6547,8 @@ static void dump_page_flags(unsigned long flags)
printk(")\n");
}
-void dump_page_badflags(struct page *page, char *reason, unsigned long badflags)
+void dump_page_badflags(struct page *page, const char *reason,
+ unsigned long badflags)
{
printk(KERN_ALERT
"page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
@@ -6560,8 +6564,8 @@ void dump_page_badflags(struct page *page, char *reason, unsigned long badflags)
mem_cgroup_print_bad_page(page);
}
-void dump_page(struct page *page, char *reason)
+void dump_page(struct page *page, const char *reason)
{
dump_page_badflags(page, reason, 0);
}
-EXPORT_SYMBOL_GPL(dump_page);
+EXPORT_SYMBOL(dump_page);
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index cb79065c19e5..8505c9262b35 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -23,129 +23,44 @@
/**
* process_vm_rw_pages - read/write pages from task specified
- * @task: task to read/write from
- * @mm: mm for task
- * @process_pages: struct pages area that can store at least
- * nr_pages_to_copy struct page pointers
- * @pa: address of page in task to start copying from/to
+ * @pages: array of pointers to pages we want to copy
* @start_offset: offset in page to start copying from/to
* @len: number of bytes to copy
- * @lvec: iovec array specifying where to copy to/from
- * @lvec_cnt: number of elements in iovec array
- * @lvec_current: index in iovec array we are up to
- * @lvec_offset: offset in bytes from current iovec iov_base we are up to
+ * @iter: where to copy to/from locally
* @vm_write: 0 means copy from, 1 means copy to
- * @nr_pages_to_copy: number of pages to copy
- * @bytes_copied: returns number of bytes successfully copied
* Returns 0 on success, error code otherwise
*/
-static int process_vm_rw_pages(struct task_struct *task,
- struct mm_struct *mm,
- struct page **process_pages,
- unsigned long pa,
- unsigned long start_offset,
- unsigned long len,
- const struct iovec *lvec,
- unsigned long lvec_cnt,
- unsigned long *lvec_current,
- size_t *lvec_offset,
- int vm_write,
- unsigned int nr_pages_to_copy,
- ssize_t *bytes_copied)
+static int process_vm_rw_pages(struct page **pages,
+ unsigned offset,
+ size_t len,
+ struct iov_iter *iter,
+ int vm_write)
{
- int pages_pinned;
- void *target_kaddr;
- int pgs_copied = 0;
- int j;
- int ret;
- ssize_t bytes_to_copy;
- ssize_t rc = 0;
-
- *bytes_copied = 0;
-
- /* Get the pages we're interested in */
- down_read(&mm->mmap_sem);
- pages_pinned = get_user_pages(task, mm, pa,
- nr_pages_to_copy,
- vm_write, 0, process_pages, NULL);
- up_read(&mm->mmap_sem);
-
- if (pages_pinned != nr_pages_to_copy) {
- rc = -EFAULT;
- goto end;
- }
-
/* Do the copy for each page */
- for (pgs_copied = 0;
- (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt);
- pgs_copied++) {
- /* Make sure we have a non zero length iovec */
- while (*lvec_current < lvec_cnt
- && lvec[*lvec_current].iov_len == 0)
- (*lvec_current)++;
- if (*lvec_current == lvec_cnt)
- break;
-
- /*
- * Will copy smallest of:
- * - bytes remaining in page
- * - bytes remaining in destination iovec
- */
- bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset,
- len - *bytes_copied);
- bytes_to_copy = min_t(ssize_t, bytes_to_copy,
- lvec[*lvec_current].iov_len
- - *lvec_offset);
-
- target_kaddr = kmap(process_pages[pgs_copied]) + start_offset;
-
- if (vm_write)
- ret = copy_from_user(target_kaddr,
- lvec[*lvec_current].iov_base
- + *lvec_offset,
- bytes_to_copy);
- else
- ret = copy_to_user(lvec[*lvec_current].iov_base
- + *lvec_offset,
- target_kaddr, bytes_to_copy);
- kunmap(process_pages[pgs_copied]);
- if (ret) {
- *bytes_copied += bytes_to_copy - ret;
- pgs_copied++;
- rc = -EFAULT;
- goto end;
- }
- *bytes_copied += bytes_to_copy;
- *lvec_offset += bytes_to_copy;
- if (*lvec_offset == lvec[*lvec_current].iov_len) {
- /*
- * Need to copy remaining part of page into the
- * next iovec if there are any bytes left in page
- */
- (*lvec_current)++;
- *lvec_offset = 0;
- start_offset = (start_offset + bytes_to_copy)
- % PAGE_SIZE;
- if (start_offset)
- pgs_copied--;
+ while (len && iov_iter_count(iter)) {
+ struct page *page = *pages++;
+ size_t copy = PAGE_SIZE - offset;
+ size_t copied;
+
+ if (copy > len)
+ copy = len;
+
+ if (vm_write) {
+ if (copy > iov_iter_count(iter))
+ copy = iov_iter_count(iter);
+ copied = iov_iter_copy_from_user(page, iter,
+ offset, copy);
+ iov_iter_advance(iter, copied);
+ set_page_dirty_lock(page);
} else {
- start_offset = 0;
- }
- }
-
-end:
- if (vm_write) {
- for (j = 0; j < pages_pinned; j++) {
- if (j < pgs_copied)
- set_page_dirty_lock(process_pages[j]);
- put_page(process_pages[j]);
+ copied = copy_page_to_iter(page, offset, copy, iter);
}
- } else {
- for (j = 0; j < pages_pinned; j++)
- put_page(process_pages[j]);
+ len -= copied;
+ if (copied < copy && iov_iter_count(iter))
+ return -EFAULT;
+ offset = 0;
}
-
- return rc;
+ return 0;
}
/* Maximum number of pages kmalloc'd to hold struct page's during copy */
@@ -155,67 +70,60 @@ end:
* process_vm_rw_single_vec - read/write pages from task specified
* @addr: start memory address of target process
* @len: size of area to copy to/from
- * @lvec: iovec array specifying where to copy to/from locally
- * @lvec_cnt: number of elements in iovec array
- * @lvec_current: index in iovec array we are up to
- * @lvec_offset: offset in bytes from current iovec iov_base we are up to
+ * @iter: where to copy to/from locally
* @process_pages: struct pages area that can store at least
* nr_pages_to_copy struct page pointers
* @mm: mm for task
* @task: task to read/write from
* @vm_write: 0 means copy from, 1 means copy to
- * @bytes_copied: returns number of bytes successfully copied
* Returns 0 on success or on failure error code
*/
static int process_vm_rw_single_vec(unsigned long addr,
unsigned long len,
- const struct iovec *lvec,
- unsigned long lvec_cnt,
- unsigned long *lvec_current,
- size_t *lvec_offset,
+ struct iov_iter *iter,
struct page **process_pages,
struct mm_struct *mm,
struct task_struct *task,
- int vm_write,
- ssize_t *bytes_copied)
+ int vm_write)
{
unsigned long pa = addr & PAGE_MASK;
unsigned long start_offset = addr - pa;
unsigned long nr_pages;
- ssize_t bytes_copied_loop;
ssize_t rc = 0;
- unsigned long nr_pages_copied = 0;
- unsigned long nr_pages_to_copy;
unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
/ sizeof(struct pages *);
- *bytes_copied = 0;
-
/* Work out address and page range required */
if (len == 0)
return 0;
nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
- while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) {
- nr_pages_to_copy = min(nr_pages - nr_pages_copied,
- max_pages_per_loop);
+ while (!rc && nr_pages && iov_iter_count(iter)) {
+ int pages = min(nr_pages, max_pages_per_loop);
+ size_t bytes;
- rc = process_vm_rw_pages(task, mm, process_pages, pa,
- start_offset, len,
- lvec, lvec_cnt,
- lvec_current, lvec_offset,
- vm_write, nr_pages_to_copy,
- &bytes_copied_loop);
- start_offset = 0;
- *bytes_copied += bytes_copied_loop;
+ /* Get the pages we're interested in */
+ down_read(&mm->mmap_sem);
+ pages = get_user_pages(task, mm, pa, pages,
+ vm_write, 0, process_pages, NULL);
+ up_read(&mm->mmap_sem);
- if (rc < 0) {
- return rc;
- } else {
- len -= bytes_copied_loop;
- nr_pages_copied += nr_pages_to_copy;
- pa += nr_pages_to_copy * PAGE_SIZE;
- }
+ if (pages <= 0)
+ return -EFAULT;
+
+ bytes = pages * PAGE_SIZE - start_offset;
+ if (bytes > len)
+ bytes = len;
+
+ rc = process_vm_rw_pages(process_pages,
+ start_offset, bytes, iter,
+ vm_write);
+ len -= bytes;
+ start_offset = 0;
+ nr_pages -= pages;
+ pa += pages * PAGE_SIZE;
+ while (pages)
+ put_page(process_pages[--pages]);
}
return rc;
@@ -228,8 +136,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
/**
* process_vm_rw_core - core of reading/writing pages from task specified
* @pid: PID of process to read/write from/to
- * @lvec: iovec array specifying where to copy to/from locally
- * @liovcnt: size of lvec array
+ * @iter: where to copy to/from locally
* @rvec: iovec array specifying where to copy to/from in the other process
* @riovcnt: size of rvec array
* @flags: currently unused
@@ -238,8 +145,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
* return less bytes than expected if an error occurs during the copying
* process.
*/
-static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
- unsigned long liovcnt,
+static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
const struct iovec *rvec,
unsigned long riovcnt,
unsigned long flags, int vm_write)
@@ -250,13 +156,10 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
struct mm_struct *mm;
unsigned long i;
ssize_t rc = 0;
- ssize_t bytes_copied_loop;
- ssize_t bytes_copied = 0;
unsigned long nr_pages = 0;
unsigned long nr_pages_iov;
- unsigned long iov_l_curr_idx = 0;
- size_t iov_l_curr_offset = 0;
ssize_t iov_len;
+ size_t total_len = iov_iter_count(iter);
/*
* Work out how many pages of struct pages we're going to need
@@ -310,24 +213,20 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
goto put_task_struct;
}
- for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) {
+ for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++)
rc = process_vm_rw_single_vec(
(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
- lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset,
- process_pages, mm, task, vm_write, &bytes_copied_loop);
- bytes_copied += bytes_copied_loop;
- if (rc != 0) {
- /* If we have managed to copy any data at all then
- we return the number of bytes copied. Otherwise
- we return the error code */
- if (bytes_copied)
- rc = bytes_copied;
- goto put_mm;
- }
- }
+ iter, process_pages, mm, task, vm_write);
+
+ /* copied = space before - space after */
+ total_len -= iov_iter_count(iter);
+
+ /* If we have managed to copy any data at all then
+ we return the number of bytes copied. Otherwise
+ we return the error code */
+ if (total_len)
+ rc = total_len;
- rc = bytes_copied;
-put_mm:
mmput(mm);
put_task_struct:
@@ -363,6 +262,7 @@ static ssize_t process_vm_rw(pid_t pid,
struct iovec iovstack_r[UIO_FASTIOV];
struct iovec *iov_l = iovstack_l;
struct iovec *iov_r = iovstack_r;
+ struct iov_iter iter;
ssize_t rc;
if (flags != 0)
@@ -378,13 +278,14 @@ static ssize_t process_vm_rw(pid_t pid,
if (rc <= 0)
goto free_iovecs;
+ iov_iter_init(&iter, iov_l, liovcnt, rc, 0);
+
rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
iovstack_r, &iov_r);
if (rc <= 0)
goto free_iovecs;
- rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
- vm_write);
+ rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
free_iovecs:
if (iov_r != iovstack_r)
@@ -424,6 +325,7 @@ compat_process_vm_rw(compat_pid_t pid,
struct iovec iovstack_r[UIO_FASTIOV];
struct iovec *iov_l = iovstack_l;
struct iovec *iov_r = iovstack_r;
+ struct iov_iter iter;
ssize_t rc = -EFAULT;
if (flags != 0)
@@ -439,14 +341,14 @@ compat_process_vm_rw(compat_pid_t pid,
&iov_l);
if (rc <= 0)
goto free_iovecs;
+ iov_iter_init(&iter, iov_l, liovcnt, rc, 0);
rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
UIO_FASTIOV, iovstack_r,
&iov_r);
if (rc <= 0)
goto free_iovecs;
- rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
- vm_write);
+ rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
free_iovecs:
if (iov_r != iovstack_r)
diff --git a/mm/readahead.c b/mm/readahead.c
index 29c5e1af5a0c..0ca36a7770b1 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -8,9 +8,7 @@
*/
#include <linux/kernel.h>
-#include <linux/fs.h>
#include <linux/gfp.h>
-#include <linux/mm.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
@@ -20,6 +18,8 @@
#include <linux/syscalls.h>
#include <linux/file.h>
+#include "internal.h"
+
/*
* Initialise a struct file's readahead state. Assumes that the caller has
* memset *ra to zero.
@@ -149,8 +149,7 @@ out:
*
* Returns the number of pages requested, or the maximum amount of I/O allowed.
*/
-static int
-__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read,
unsigned long lookahead_size)
{
@@ -244,20 +243,6 @@ unsigned long max_sane_readahead(unsigned long nr)
}
/*
- * Submit IO for the read-ahead request in file_ra_state.
- */
-unsigned long ra_submit(struct file_ra_state *ra,
- struct address_space *mapping, struct file *filp)
-{
- int actual;
-
- actual = __do_page_cache_readahead(mapping, filp,
- ra->start, ra->size, ra->async_size);
-
- return actual;
-}
-
-/*
* Set the initial window size, round to next power of 2 and square
* for small size, x 4 for medium, and x 2 for large
* for 128k (32 page) max ra
diff --git a/mm/rmap.c b/mm/rmap.c
index 11cf322f8133..9c3e77396d1a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1332,9 +1332,19 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
BUG_ON(!page || PageAnon(page));
if (locked_vma) {
- mlock_vma_page(page); /* no-op if already mlocked */
- if (page == check_page)
+ if (page == check_page) {
+ /* we know we have check_page locked */
+ mlock_vma_page(page);
ret = SWAP_MLOCK;
+ } else if (trylock_page(page)) {
+ /*
+ * If we can lock the page, perform mlock.
+ * Otherwise leave the page alone, it will be
+ * eventually encountered again later.
+ */
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
continue; /* don't unmap */
}
diff --git a/mm/shmem.c b/mm/shmem.c
index a3ba988ec946..9f70e02111c6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -683,7 +683,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
*/
- error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
+ error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL);
if (error)
goto out;
/* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -1080,7 +1080,7 @@ repeat:
goto failed;
}
- error = mem_cgroup_cache_charge(page, current->mm,
+ error = mem_cgroup_charge_file(page, current->mm,
gfp & GFP_RECLAIM_MASK);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
@@ -1134,7 +1134,7 @@ repeat:
SetPageSwapBacked(page);
__set_page_locked(page);
- error = mem_cgroup_cache_charge(page, current->mm,
+ error = mem_cgroup_charge_file(page, current->mm,
gfp & GFP_RECLAIM_MASK);
if (error)
goto decused;
@@ -1402,13 +1402,25 @@ shmem_write_end(struct file *file, struct address_space *mapping,
return copied;
}
-static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
+static ssize_t shmem_file_aio_read(struct kiocb *iocb,
+ const struct iovec *iov, unsigned long nr_segs, loff_t pos)
{
- struct inode *inode = file_inode(filp);
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
pgoff_t index;
unsigned long offset;
enum sgp_type sgp = SGP_READ;
+ int error = 0;
+ ssize_t retval;
+ size_t count;
+ loff_t *ppos = &iocb->ki_pos;
+ struct iov_iter iter;
+
+ retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+ if (retval)
+ return retval;
+ iov_iter_init(&iter, iov, nr_segs, count, 0);
/*
* Might this read be for a stacking filesystem? Then when reading
@@ -1436,10 +1448,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
break;
}
- desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
- if (desc->error) {
- if (desc->error == -EINVAL)
- desc->error = 0;
+ error = shmem_getpage(inode, index, &page, sgp, NULL);
+ if (error) {
+ if (error == -EINVAL)
+ error = 0;
break;
}
if (page)
@@ -1483,61 +1495,26 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
/*
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
- *
- * The actor routine returns how many bytes were actually used..
- * NOTE! This may not be the same as how much of a user buffer
- * we filled up (we may be padding etc), so we can only update
- * "pos" here (the actor routine has to update the user buffer
- * pointers and the remaining count).
*/
- ret = actor(desc, page, offset, nr);
+ ret = copy_page_to_iter(page, offset, nr, &iter);
+ retval += ret;
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
page_cache_release(page);
- if (ret != nr || !desc->count)
+ if (!iov_iter_count(&iter))
break;
-
+ if (ret < nr) {
+ error = -EFAULT;
+ break;
+ }
cond_resched();
}
*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
- file_accessed(filp);
-}
-
-static ssize_t shmem_file_aio_read(struct kiocb *iocb,
- const struct iovec *iov, unsigned long nr_segs, loff_t pos)
-{
- struct file *filp = iocb->ki_filp;
- ssize_t retval;
- unsigned long seg;
- size_t count;
- loff_t *ppos = &iocb->ki_pos;
-
- retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
- if (retval)
- return retval;
-
- for (seg = 0; seg < nr_segs; seg++) {
- read_descriptor_t desc;
-
- desc.written = 0;
- desc.arg.buf = iov[seg].iov_base;
- desc.count = iov[seg].iov_len;
- if (desc.count == 0)
- continue;
- desc.error = 0;
- do_shmem_file_read(filp, ppos, &desc, file_read_actor);
- retval += desc.written;
- if (desc.error) {
- retval = retval ?: desc.error;
- break;
- }
- if (desc.count > 0)
- break;
- }
- return retval;
+ file_accessed(file);
+ return retval ? retval : error;
}
static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
@@ -1576,7 +1553,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
index = *ppos >> PAGE_CACHE_SHIFT;
loff = *ppos & ~PAGE_CACHE_MASK;
req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- nr_pages = min(req_pages, pipe->buffers);
+ nr_pages = min(req_pages, spd.nr_pages_max);
spd.nr_pages = find_get_pages_contig(mapping, index,
nr_pages, spd.pages);
@@ -2723,6 +2700,7 @@ static const struct super_operations shmem_ops = {
static const struct vm_operations_struct shmem_vm_ops = {
.fault = shmem_fault,
+ .map_pages = filemap_map_pages,
#ifdef CONFIG_NUMA
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
diff --git a/mm/slab.c b/mm/slab.c
index 9153c802e2fe..388cb1ae6fbc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -157,6 +157,17 @@
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif
+#define FREELIST_BYTE_INDEX (((PAGE_SIZE >> BITS_PER_BYTE) \
+ <= SLAB_OBJ_MIN_SIZE) ? 1 : 0)
+
+#if FREELIST_BYTE_INDEX
+typedef unsigned char freelist_idx_t;
+#else
+typedef unsigned short freelist_idx_t;
+#endif
+
+#define SLAB_OBJ_MAX_NUM (1 << sizeof(freelist_idx_t) * BITS_PER_BYTE)
+
/*
* true if a page was allocated from pfmemalloc reserves for network-based
* swap
@@ -277,8 +288,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
* OTOH the cpuarrays can contain lots of objects,
* which could lock up otherwise freeable slabs.
*/
-#define REAPTIMEOUT_CPUC (2*HZ)
-#define REAPTIMEOUT_LIST3 (4*HZ)
+#define REAPTIMEOUT_AC (2*HZ)
+#define REAPTIMEOUT_NODE (4*HZ)
#if STATS
#define STATS_INC_ACTIVE(x) ((x)->num_active++)
@@ -565,9 +576,31 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
return cachep->array[smp_processor_id()];
}
-static size_t slab_mgmt_size(size_t nr_objs, size_t align)
+static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
+ size_t idx_size, size_t align)
{
- return ALIGN(nr_objs * sizeof(unsigned int), align);
+ int nr_objs;
+ size_t freelist_size;
+
+ /*
+ * Ignore padding for the initial guess. The padding
+ * is at most @align-1 bytes, and @buffer_size is at
+ * least @align. In the worst case, this result will
+ * be one greater than the number of objects that fit
+ * into the memory allocation when taking the padding
+ * into account.
+ */
+ nr_objs = slab_size / (buffer_size + idx_size);
+
+ /*
+ * This calculated number will be either the right
+ * amount, or one greater than what we want.
+ */
+ freelist_size = slab_size - nr_objs * buffer_size;
+ if (freelist_size < ALIGN(nr_objs * idx_size, align))
+ nr_objs--;
+
+ return nr_objs;
}
/*
@@ -600,25 +633,9 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
nr_objs = slab_size / buffer_size;
} else {
- /*
- * Ignore padding for the initial guess. The padding
- * is at most @align-1 bytes, and @buffer_size is at
- * least @align. In the worst case, this result will
- * be one greater than the number of objects that fit
- * into the memory allocation when taking the padding
- * into account.
- */
- nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int));
-
- /*
- * This calculated number will be either the right
- * amount, or one greater than what we want.
- */
- if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
- > slab_size)
- nr_objs--;
-
- mgmt_size = slab_mgmt_size(nr_objs, align);
+ nr_objs = calculate_nr_objs(slab_size, buffer_size,
+ sizeof(freelist_idx_t), align);
+ mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align);
}
*num = nr_objs;
*left_over = slab_size - nr_objs*buffer_size - mgmt_size;
@@ -1067,7 +1084,7 @@ static int init_cache_node_node(int node)
list_for_each_entry(cachep, &slab_caches, list) {
/*
- * Set up the size64 kmemlist for cpu before we can
+ * Set up the kmem_cache_node for cpu before we can
* begin anything. Make sure some other cpu on this
* node has not already allocated this
*/
@@ -1076,12 +1093,12 @@ static int init_cache_node_node(int node)
if (!n)
return -ENOMEM;
kmem_cache_node_init(n);
- n->next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ n->next_reap = jiffies + REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
/*
- * The l3s don't come and go as CPUs come and
- * go. slab_mutex is sufficient
+ * The kmem_cache_nodes don't come and go as CPUs
+ * come and go. slab_mutex is sufficient
* protection here.
*/
cachep->node[node] = n;
@@ -1406,8 +1423,8 @@ static void __init set_up_node(struct kmem_cache *cachep, int index)
for_each_online_node(node) {
cachep->node[node] = &init_kmem_cache_node[index + node];
cachep->node[node]->next_reap = jiffies +
- REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
}
}
@@ -2010,6 +2027,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
if (!num)
continue;
+ /* Can't handle number of objects more than SLAB_OBJ_MAX_NUM */
+ if (num > SLAB_OBJ_MAX_NUM)
+ break;
+
if (flags & CFLGS_OFF_SLAB) {
/*
* Max number of objs-per-slab for caches which
@@ -2017,7 +2038,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
* looping condition in cache_grow().
*/
offslab_limit = size;
- offslab_limit /= sizeof(unsigned int);
+ offslab_limit /= sizeof(freelist_idx_t);
if (num > offslab_limit)
break;
@@ -2103,8 +2124,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
}
}
cachep->node[numa_mem_id()]->next_reap =
- jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ jiffies + REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
cpu_cache_get(cachep)->avail = 0;
cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
@@ -2243,7 +2264,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
* it too early on. Always use on-slab management when
* SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
*/
- if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
+ if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init &&
!(flags & SLAB_NOLEAKTRACE))
/*
* Size is large, assume best to place the slab management obj
@@ -2252,6 +2273,12 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
flags |= CFLGS_OFF_SLAB;
size = ALIGN(size, cachep->align);
+ /*
+ * We should restrict the number of objects in a slab to implement a
+ * byte-sized index. See the comment on the SLAB_OBJ_MIN_SIZE definition.
+ */
+ if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
+ size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
left_over = calculate_slab_order(cachep, size, cachep->align, flags);
@@ -2259,7 +2286,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
return -E2BIG;
freelist_size =
- ALIGN(cachep->num * sizeof(unsigned int), cachep->align);
+ ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align);
/*
* If the slab has been placed off-slab, and we have enough space then
@@ -2272,7 +2299,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment */
- freelist_size = cachep->num * sizeof(unsigned int);
+ freelist_size = cachep->num * sizeof(freelist_idx_t);
#ifdef CONFIG_PAGE_POISONING
/* If we're going to use the generic kernel_map_pages()
@@ -2300,10 +2327,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (flags & CFLGS_OFF_SLAB) {
cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
/*
- * This is a possibility for one of the malloc_sizes caches.
+ * This is a possibility for one of the kmalloc_{dma,}_caches.
* But since we go off slab only for object size greater than
- * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
- * this should not happen at all.
+ * PAGE_SIZE/32, and kmalloc_{dma,}_caches get created
+ * in ascending order, this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
@@ -2511,14 +2538,17 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
/*
* Get the memory for a slab management obj.
- * For a slab cache when the slab descriptor is off-slab, slab descriptors
- * always come from malloc_sizes caches. The slab descriptor cannot
- * come from the same cache which is getting created because,
- * when we are searching for an appropriate cache for these
- * descriptors in kmem_cache_create, we search through the malloc_sizes array.
- * If we are creating a malloc_sizes cache here it would not be visible to
- * kmem_find_general_cachep till the initialization is complete.
- * Hence we cannot have freelist_cache same as the original cache.
+ *
+ * For a slab cache whose slab descriptor is off-slab, the
+ * slab descriptor can't come from the same cache which is being created,
+ * because if that were the case, we would defer the creation of
+ * the kmalloc_{dma,}_cache of size sizeof(slab descriptor) to this point.
+ * We would then eventually call down to __kmem_cache_create(), which
+ * in turn looks up the kmalloc_{dma,}_caches for the desired-size one.
+ * This is a "chicken-and-egg" problem.
+ *
+ * So the off-slab slab descriptor shall come from the kmalloc_{dma,}_caches,
+ * which are all initialized during kmem_cache_init().
*/
static void *alloc_slabmgmt(struct kmem_cache *cachep,
struct page *page, int colour_off,
@@ -2542,9 +2572,15 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
return freelist;
}
-static inline unsigned int *slab_freelist(struct page *page)
+static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx)
{
- return (unsigned int *)(page->freelist);
+ return ((freelist_idx_t *)page->freelist)[idx];
+}
+
+static inline void set_free_obj(struct page *page,
+ unsigned int idx, freelist_idx_t val)
+{
+ ((freelist_idx_t *)(page->freelist))[idx] = val;
}
static void cache_init_objs(struct kmem_cache *cachep,
@@ -2589,7 +2625,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
if (cachep->ctor)
cachep->ctor(objp);
#endif
- slab_freelist(page)[i] = i;
+ set_free_obj(page, i, i);
}
}
@@ -2608,7 +2644,7 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,
{
void *objp;
- objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]);
+ objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
page->active++;
#if DEBUG
WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
@@ -2629,7 +2665,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
/* Verify double free bug */
for (i = page->active; i < cachep->num; i++) {
- if (slab_freelist(page)[i] == objnr) {
+ if (get_free_obj(page, i) == objnr) {
printk(KERN_ERR "slab: double free detected in cache "
"'%s', objp %p\n", cachep->name, objp);
BUG();
@@ -2637,7 +2673,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
}
#endif
page->active--;
- slab_freelist(page)[page->active] = objnr;
+ set_free_obj(page, page->active, objnr);
}
/*
@@ -2886,9 +2922,9 @@ retry:
/* move slabp to correct slabp list: */
list_del(&page->lru);
if (page->active == cachep->num)
- list_add(&page->list, &n->slabs_full);
+ list_add(&page->lru, &n->slabs_full);
else
- list_add(&page->list, &n->slabs_partial);
+ list_add(&page->lru, &n->slabs_partial);
}
must_grow:
@@ -3027,7 +3063,7 @@ out:
#ifdef CONFIG_NUMA
/*
- * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
+ * Try allocating on another node if PF_SPREAD_SLAB or a mempolicy is set.
*
* If we are in_interrupt, then process context, including cpusets and
* mempolicy, may not apply and should not be used for allocation policy.
@@ -3042,7 +3078,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
nid_alloc = cpuset_slab_spread_node();
else if (current->mempolicy)
- nid_alloc = slab_node();
+ nid_alloc = mempolicy_slab_node();
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
@@ -3074,7 +3110,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
- zonelist = node_zonelist(slab_node(), flags);
+ zonelist = node_zonelist(mempolicy_slab_node(), flags);
retry:
/*
@@ -3245,11 +3281,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
flags);
- if (likely(ptr))
+ if (likely(ptr)) {
kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
-
- if (unlikely((flags & __GFP_ZERO) && ptr))
- memset(ptr, 0, cachep->object_size);
+ if (unlikely(flags & __GFP_ZERO))
+ memset(ptr, 0, cachep->object_size);
+ }
return ptr;
}
@@ -3259,7 +3295,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
{
void *objp;
- if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
+ if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) {
objp = alternate_node_alloc(cache, flags);
if (objp)
goto out;
@@ -3310,17 +3346,17 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
flags);
prefetchw(objp);
- if (likely(objp))
+ if (likely(objp)) {
kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
-
- if (unlikely((flags & __GFP_ZERO) && objp))
- memset(objp, 0, cachep->object_size);
+ if (unlikely(flags & __GFP_ZERO))
+ memset(objp, 0, cachep->object_size);
+ }
return objp;
}
/*
- * Caller needs to acquire correct kmem_list's list_lock
+ * Caller needs to acquire correct kmem_cache_node's list_lock
*/
static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
int node)
@@ -3574,11 +3610,6 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
struct kmem_cache *cachep;
void *ret;
- /* If you want to save a few bytes .text space: replace
- * __ with kmem_.
- * Then kmalloc uses the uninlined functions instead of the inline
- * functions.
- */
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
@@ -3670,7 +3701,7 @@ EXPORT_SYMBOL(kfree);
/*
* This initializes kmem_cache_node or resizes various caches for all nodes.
*/
-static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
+static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
{
int node;
struct kmem_cache_node *n;
@@ -3726,8 +3757,8 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
}
kmem_cache_node_init(n);
- n->next_reap = jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ n->next_reap = jiffies + REAPTIMEOUT_NODE +
+ ((unsigned long)cachep) % REAPTIMEOUT_NODE;
n->shared = new_shared;
n->alien = new_alien;
n->free_limit = (1 + nr_cpus_node(node)) *
@@ -3813,7 +3844,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
kfree(ccold);
}
kfree(new);
- return alloc_kmemlist(cachep, gfp);
+ return alloc_kmem_cache_node(cachep, gfp);
}
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
@@ -3982,7 +4013,7 @@ static void cache_reap(struct work_struct *w)
if (time_after(n->next_reap, jiffies))
goto next;
- n->next_reap = jiffies + REAPTIMEOUT_LIST3;
+ n->next_reap = jiffies + REAPTIMEOUT_NODE;
drain_array(searchp, n, n->shared, 0, node);
@@ -4003,7 +4034,7 @@ next:
next_reap_node();
out:
/* Set up the next iteration */
- schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
+ schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC));
}
#ifdef CONFIG_SLABINFO
@@ -4210,7 +4241,7 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c,
for (j = page->active; j < c->num; j++) {
/* Skip freed item */
- if (slab_freelist(page)[j] == i) {
+ if (get_free_obj(page, j) == i) {
active = false;
break;
}
diff --git a/mm/slab.h b/mm/slab.h
index 8184a7cde272..3045316b7c9d 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -55,12 +55,12 @@ extern void create_boot_cache(struct kmem_cache *, const char *name,
struct mem_cgroup;
#ifdef CONFIG_SLUB
struct kmem_cache *
-__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *));
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *));
#else
static inline struct kmem_cache *
-__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *))
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
{ return NULL; }
#endif
@@ -119,13 +119,6 @@ static inline bool is_root_cache(struct kmem_cache *s)
return !s->memcg_params || s->memcg_params->is_root_cache;
}
-static inline bool cache_match_memcg(struct kmem_cache *cachep,
- struct mem_cgroup *memcg)
-{
- return (is_root_cache(cachep) && !memcg) ||
- (cachep->memcg_params->memcg == memcg);
-}
-
static inline void memcg_bind_pages(struct kmem_cache *s, int order)
{
if (!is_root_cache(s))
@@ -204,12 +197,6 @@ static inline bool is_root_cache(struct kmem_cache *s)
return true;
}
-static inline bool cache_match_memcg(struct kmem_cache *cachep,
- struct mem_cgroup *memcg)
-{
- return true;
-}
-
static inline void memcg_bind_pages(struct kmem_cache *s, int order)
{
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 1ec3c619ba04..f3cfccf76dda 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -29,8 +29,7 @@ DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
#ifdef CONFIG_DEBUG_VM
-static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
- size_t size)
+static int kmem_cache_sanity_check(const char *name, size_t size)
{
struct kmem_cache *s = NULL;
@@ -57,13 +56,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
}
#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
- /*
- * For simplicity, we won't check this in the list of memcg
- * caches. We have control over memcg naming, and if there
- * aren't duplicates in the global list, there won't be any
- * duplicates in the memcg lists as well.
- */
- if (!memcg && !strcmp(s->name, name)) {
+ if (!strcmp(s->name, name)) {
pr_err("%s (%s): Cache name already exists.\n",
__func__, name);
dump_stack();
@@ -77,8 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
return 0;
}
#else
-static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
- const char *name, size_t size)
+static inline int kmem_cache_sanity_check(const char *name, size_t size)
{
return 0;
}
@@ -139,6 +131,46 @@ unsigned long calculate_alignment(unsigned long flags,
return ALIGN(align, sizeof(void *));
}
+/*
+ * Allocate and register a cache descriptor; returns it or ERR_PTR(-errno).
+ * On failure the descriptor is freed but @name is left to the caller.
+ */
+static struct kmem_cache *
+do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
+		     unsigned long flags, void (*ctor)(void *),
+		     struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+	struct kmem_cache *s;
+	int err;
+
+	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
+	if (!s)
+		return ERR_PTR(-ENOMEM);
+
+	s->name = name;
+	s->object_size = object_size;
+	s->size = size;
+	s->align = align;
+	s->ctor = ctor;
+
+	err = memcg_alloc_cache_params(memcg, s, root_cache);
+	if (err)
+		goto out_free_cache;
+
+	err = __kmem_cache_create(s, flags);
+	if (err)
+		goto out_free_cache;
+
+	s->refcount = 1;
+	list_add(&s->list, &slab_caches);
+	memcg_register_cache(s);
+	return s;
+
+out_free_cache:
+	memcg_free_cache_params(s);
+	kmem_cache_free(kmem_cache, s);	/* pairs with kmem_cache_zalloc() above */
+	return ERR_PTR(err);
+}
/*
* kmem_cache_create - Create a cache.
@@ -164,34 +196,21 @@ unsigned long calculate_alignment(unsigned long flags,
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
*/
-
struct kmem_cache *
-kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *),
- struct kmem_cache *parent_cache)
+kmem_cache_create(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
{
- struct kmem_cache *s = NULL;
+ struct kmem_cache *s;
+ char *cache_name;
int err;
get_online_cpus();
mutex_lock(&slab_mutex);
- err = kmem_cache_sanity_check(memcg, name, size);
+ err = kmem_cache_sanity_check(name, size);
if (err)
goto out_unlock;
- if (memcg) {
- /*
- * Since per-memcg caches are created asynchronously on first
- * allocation (see memcg_kmem_get_cache()), several threads can
- * try to create the same cache, but only one of them may
- * succeed. Therefore if we get here and see the cache has
- * already been created, we silently return NULL.
- */
- if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg)))
- goto out_unlock;
- }
-
/*
* Some allocators will constraint the set of valid flags to a subset
* of all flags. We expect them to define CACHE_CREATE_MASK in this
@@ -200,50 +219,29 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
*/
flags &= CACHE_CREATE_MASK;
- s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
+ s = __kmem_cache_alias(name, size, align, flags, ctor);
if (s)
goto out_unlock;
- err = -ENOMEM;
- s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
- if (!s)
+ cache_name = kstrdup(name, GFP_KERNEL);
+ if (!cache_name) {
+ err = -ENOMEM;
goto out_unlock;
+ }
- s->object_size = s->size = size;
- s->align = calculate_alignment(flags, align, size);
- s->ctor = ctor;
-
- s->name = kstrdup(name, GFP_KERNEL);
- if (!s->name)
- goto out_free_cache;
-
- err = memcg_alloc_cache_params(memcg, s, parent_cache);
- if (err)
- goto out_free_cache;
-
- err = __kmem_cache_create(s, flags);
- if (err)
- goto out_free_cache;
-
- s->refcount = 1;
- list_add(&s->list, &slab_caches);
- memcg_register_cache(s);
+ s = do_kmem_cache_create(cache_name, size, size,
+ calculate_alignment(flags, align, size),
+ flags, ctor, NULL, NULL);
+ if (IS_ERR(s)) {
+ err = PTR_ERR(s);
+ kfree(cache_name);
+ }
out_unlock:
mutex_unlock(&slab_mutex);
put_online_cpus();
if (err) {
- /*
- * There is no point in flooding logs with warnings or
- * especially crashing the system if we fail to create a cache
- * for a memcg. In this case we will be accounting the memcg
- * allocation to the root cgroup until we succeed to create its
- * own cache, but it isn't that critical.
- */
- if (!memcg)
- return NULL;
-
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
name, err);
@@ -255,52 +253,112 @@ out_unlock:
return NULL;
}
return s;
+}
+EXPORT_SYMBOL(kmem_cache_create);
-out_free_cache:
- memcg_free_cache_params(s);
- kfree(s->name);
- kmem_cache_free(kmem_cache, s);
- goto out_unlock;
+#ifdef CONFIG_MEMCG_KMEM
+/*
+ * kmem_cache_create_memcg - Create a cache for a memory cgroup.
+ * @memcg: The memory cgroup the new cache is for.
+ * @root_cache: The parent of the new cache.
+ *
+ * Try to create a kmem cache that will serve allocation requests going from
+ * @memcg to @root_cache. The new cache inherits properties from its parent.
+ * Nothing is returned; on failure the cache is simply not created.
+ */
+void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+ struct kmem_cache *s;
+ char *cache_name;
+
+ get_online_cpus();
+ mutex_lock(&slab_mutex);
+
+ /*
+ * Per-memcg caches are created asynchronously on first allocation (see
+ * memcg_kmem_get_cache()), so several threads may race to create the
+ * same cache. Only one may succeed; the rest find it here and bail out.
+ */
+ if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg)))
+ goto out_unlock;
+
+ cache_name = memcg_create_cache_name(memcg, root_cache);
+ if (!cache_name)
+ goto out_unlock;
+
+ s = do_kmem_cache_create(cache_name, root_cache->object_size,
+ root_cache->size, root_cache->align,
+ root_cache->flags, root_cache->ctor,
+ memcg, root_cache);
+ if (IS_ERR(s)) {
+ kfree(cache_name); /* the name is not consumed on failure */
+ goto out_unlock;
+ }
+
+ s->allocflags |= __GFP_KMEMCG;
+
+out_unlock:
+ mutex_unlock(&slab_mutex);
+ put_online_cpus();
}
-struct kmem_cache *
-kmem_cache_create(const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *))
+static int kmem_cache_destroy_memcg_children(struct kmem_cache *s)
{
- return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
+ int rc;
+
+ if (!s->memcg_params ||
+ !s->memcg_params->is_root_cache)
+ return 0;
+
+ mutex_unlock(&slab_mutex);
+ rc = __kmem_cache_destroy_memcg_children(s);
+ mutex_lock(&slab_mutex);
+
+ return rc;
}
-EXPORT_SYMBOL(kmem_cache_create);
+#else
+static int kmem_cache_destroy_memcg_children(struct kmem_cache *s)
+{
+ return 0;
+}
+#endif /* CONFIG_MEMCG_KMEM */
void kmem_cache_destroy(struct kmem_cache *s)
{
- /* Destroy all the children caches if we aren't a memcg cache */
- kmem_cache_destroy_memcg_children(s);
-
get_online_cpus();
mutex_lock(&slab_mutex);
+
s->refcount--;
- if (!s->refcount) {
- list_del(&s->list);
-
- if (!__kmem_cache_shutdown(s)) {
- memcg_unregister_cache(s);
- mutex_unlock(&slab_mutex);
- if (s->flags & SLAB_DESTROY_BY_RCU)
- rcu_barrier();
-
- memcg_free_cache_params(s);
- kfree(s->name);
- kmem_cache_free(kmem_cache, s);
- } else {
- list_add(&s->list, &slab_caches);
- mutex_unlock(&slab_mutex);
- printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
- s->name);
- dump_stack();
- }
- } else {
- mutex_unlock(&slab_mutex);
+ if (s->refcount)
+ goto out_unlock;
+
+ if (kmem_cache_destroy_memcg_children(s) != 0)
+ goto out_unlock;
+
+ list_del(&s->list);
+ memcg_unregister_cache(s);
+
+ if (__kmem_cache_shutdown(s) != 0) {
+ list_add(&s->list, &slab_caches);
+ memcg_register_cache(s);
+ printk(KERN_ERR "kmem_cache_destroy %s: "
+ "Slab cache still has objects\n", s->name);
+ dump_stack();
+ goto out_unlock;
}
+
+ mutex_unlock(&slab_mutex);
+ if (s->flags & SLAB_DESTROY_BY_RCU)
+ rcu_barrier();
+
+ memcg_free_cache_params(s);
+ kfree(s->name);
+ kmem_cache_free(kmem_cache, s);
+ goto out_put_cpus;
+
+out_unlock:
+ mutex_unlock(&slab_mutex);
+out_put_cpus:
put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);
diff --git a/mm/slob.c b/mm/slob.c
index 4bf8809dfcce..730cad45d4be 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -111,13 +111,13 @@ static inline int slob_page_free(struct page *sp)
static void set_slob_page_free(struct page *sp, struct list_head *list)
{
- list_add(&sp->list, list);
+ list_add(&sp->lru, list);
__SetPageSlobFree(sp);
}
static inline void clear_slob_page_free(struct page *sp)
{
- list_del(&sp->list);
+ list_del(&sp->lru);
__ClearPageSlobFree(sp);
}
@@ -282,7 +282,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
spin_lock_irqsave(&slob_lock, flags);
/* Iterate through each partially free page, try to find room */
- list_for_each_entry(sp, slob_list, list) {
+ list_for_each_entry(sp, slob_list, lru) {
#ifdef CONFIG_NUMA
/*
* If there's a node specification, search for a partial
@@ -296,7 +296,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
continue;
/* Attempt to alloc */
- prev = sp->list.prev;
+ prev = sp->lru.prev;
b = slob_page_alloc(sp, size, align);
if (!b)
continue;
@@ -322,7 +322,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
spin_lock_irqsave(&slob_lock, flags);
sp->units = SLOB_UNITS(PAGE_SIZE);
sp->freelist = b;
- INIT_LIST_HEAD(&sp->list);
+ INIT_LIST_HEAD(&sp->lru);
set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
set_slob_page_free(sp, slob_list);
b = slob_page_alloc(sp, size, align);
diff --git a/mm/slub.c b/mm/slub.c
index fe6d7be22ef0..5e234f1f8853 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -224,7 +224,11 @@ static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
- __this_cpu_inc(s->cpu_slab->stat[si]);
+ /*
+ * The rmw is racy on a preemptible kernel but this is acceptable, so
+ * avoid this_cpu_add()'s irq-disable overhead.
+ */
+ raw_cpu_inc(s->cpu_slab->stat[si]);
#endif
}
@@ -1348,11 +1352,12 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
page = alloc_slab_page(alloc_gfp, node, oo);
if (unlikely(!page)) {
oo = s->min;
+ alloc_gfp = flags;
/*
* Allocation may have failed due to fragmentation.
* Try a lower order alloc if possible
*/
- page = alloc_slab_page(flags, node, oo);
+ page = alloc_slab_page(alloc_gfp, node, oo);
if (page)
stat(s, ORDER_FALLBACK);
@@ -1362,7 +1367,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
&& !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
int pages = 1 << oo_order(oo);
- kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
+ kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
/*
* Objects from caches that have a constructor don't get
@@ -1685,7 +1690,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
do {
cpuset_mems_cookie = read_mems_allowed_begin();
- zonelist = node_zonelist(slab_node(), flags);
+ zonelist = node_zonelist(mempolicy_slab_node(), flags);
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n;
@@ -3685,6 +3690,9 @@ static int slab_unmergeable(struct kmem_cache *s)
if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
return 1;
+ if (!is_root_cache(s))
+ return 1;
+
if (s->ctor)
return 1;
@@ -3697,9 +3705,8 @@ static int slab_unmergeable(struct kmem_cache *s)
return 0;
}
-static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
- size_t align, unsigned long flags, const char *name,
- void (*ctor)(void *))
+static struct kmem_cache *find_mergeable(size_t size, size_t align,
+ unsigned long flags, const char *name, void (*ctor)(void *))
{
struct kmem_cache *s;
@@ -3722,7 +3729,7 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
continue;
if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
- continue;
+ continue;
/*
* Check if alignment is compatible.
* Courtesy of Adrian Drzewiecki
@@ -3733,23 +3740,24 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
if (s->size - size >= sizeof(void *))
continue;
- if (!cache_match_memcg(s, memcg))
- continue;
-
return s;
}
return NULL;
}
struct kmem_cache *
-__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
- size_t align, unsigned long flags, void (*ctor)(void *))
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
- s = find_mergeable(memcg, size, align, flags, name, ctor);
+ s = find_mergeable(size, align, flags, name, ctor);
if (s) {
+ int i;
+ struct kmem_cache *c;
+
s->refcount++;
+
/*
* Adjust the object sizes so that we clear
* the complete object on kzalloc.
@@ -3757,6 +3765,15 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
s->object_size = max(s->object_size, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
+ for_each_memcg_cache_index(i) {
+ c = cache_from_memcg_idx(s, i);
+ if (!c)
+ continue;
+ c->object_size = s->object_size;
+ c->inuse = max_t(int, c->inuse,
+ ALIGN(size, sizeof(void *)));
+ }
+
if (sysfs_slab_alias(s, name)) {
s->refcount--;
s = NULL;
@@ -5126,6 +5143,15 @@ static const struct kset_uevent_ops slab_uevent_ops = {
static struct kset *slab_kset;
+static inline struct kset *cache_kset(struct kmem_cache *s)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ if (!is_root_cache(s)) /* non-root cache: use its root cache's memcg_kset */
+ return s->memcg_params->root_cache->memcg_kset;
+#endif
+ return slab_kset; /* otherwise the global slab kset */
+}
+
#define ID_STR_LENGTH 64
/* Create a unique string id for a slab cache:
@@ -5191,26 +5217,39 @@ static int sysfs_slab_add(struct kmem_cache *s)
name = create_unique_id(s);
}
- s->kobj.kset = slab_kset;
+ s->kobj.kset = cache_kset(s);
err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
- if (err) {
- kobject_put(&s->kobj);
- return err;
- }
+ if (err)
+ goto out_put_kobj;
err = sysfs_create_group(&s->kobj, &slab_attr_group);
- if (err) {
- kobject_del(&s->kobj);
- kobject_put(&s->kobj);
- return err;
+ if (err)
+ goto out_del_kobj;
+
+#ifdef CONFIG_MEMCG_KMEM
+ if (is_root_cache(s)) {
+ s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
+ if (!s->memcg_kset) {
+ err = -ENOMEM;
+ goto out_del_kobj;
+ }
}
+#endif
+
kobject_uevent(&s->kobj, KOBJ_ADD);
if (!unmergeable) {
/* Setup first alias */
sysfs_slab_alias(s, s->name);
- kfree(name);
}
- return 0;
+out:
+ if (!unmergeable)
+ kfree(name);
+ return err;
+out_del_kobj:
+ kobject_del(&s->kobj);
+out_put_kobj:
+ kobject_put(&s->kobj);
+ goto out;
}
static void sysfs_slab_remove(struct kmem_cache *s)
@@ -5222,6 +5261,9 @@ static void sysfs_slab_remove(struct kmem_cache *s)
*/
return;
+#ifdef CONFIG_MEMCG_KMEM
+ kset_unregister(s->memcg_kset);
+#endif
kobject_uevent(&s->kobj, KOBJ_REMOVE);
kobject_del(&s->kobj);
kobject_put(&s->kobj);
diff --git a/mm/sparse.c b/mm/sparse.c
index 38cad8fd7397..d1b48b691ac8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -5,10 +5,12 @@
#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
+#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
+
#include "internal.h"
#include <asm/dma.h>
#include <asm/pgalloc.h>
@@ -461,7 +463,7 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
}
#endif
-void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
+void __weak __meminit vmemmap_populate_print_last(void)
{
}
diff --git a/mm/util.c b/mm/util.c
index a24aa22f2473..f380af7ea779 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,6 +1,7 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
+#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
@@ -307,7 +308,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
* If the architecture not support this function, simply return with no
* page pinned
*/
-int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
+int __weak __get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
return 0;
@@ -338,7 +339,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
* callers need to carefully consider what to use. On many architectures,
* get_user_pages_fast simply falls back to get_user_pages.
*/
-int __attribute__((weak)) get_user_pages_fast(unsigned long start,
+int __weak get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
struct mm_struct *mm = current->mm;
@@ -445,6 +446,54 @@ unsigned long vm_commit_limit(void)
return allowed;
}
+/**
+ * get_cmdline() - copy the cmdline value to a buffer.
+ * @task: the task whose cmdline value to copy.
+ * @buffer: the buffer to copy to.
+ * @buflen: the length of the buffer. Larger cmdline values are truncated
+ * to this length.
+ * Returns the size of the cmdline value copied, or 0 if @task has no mm or its
+ * argument area is not set up yet. The copy may not end with a NUL byte.
+ */
+int get_cmdline(struct task_struct *task, char *buffer, int buflen)
+{
+ int res = 0;
+ unsigned int len;
+ struct mm_struct *mm = get_task_mm(task);
+ if (!mm)
+ goto out;
+ if (!mm->arg_end)
+ goto out_mm; /* Shh! No looking before we're done */
+
+ len = mm->arg_end - mm->arg_start;
+
+ if (len > buflen)
+ len = buflen; /* truncate to the caller's buffer */
+
+ res = access_process_vm(task, mm->arg_start, buffer, len, 0);
+
+ /*
+ * If the NUL at the end of the args has been overwritten, then
+ * assume the application is using setproctitle(3).
+ */
+ if (res > 0 && buffer[res-1] != '\0' && len < buflen) {
+ len = strnlen(buffer, res);
+ if (len < res) {
+ res = len; /* a NUL occurs earlier: the value ends there */
+ } else {
+ len = mm->env_end - mm->env_start; /* no NUL: go on into the env area */
+ if (len > buflen - res)
+ len = buflen - res;
+ res += access_process_vm(task, mm->env_start,
+ buffer+res, len, 0);
+ res = strnlen(buffer, res); /* cut at the first NUL, if any */
+ }
+ }
+out_mm:
+ mmput(mm); /* drop the reference taken by get_task_mm() */
+out:
+ return res;
+}
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
diff --git a/mm/vmacache.c b/mm/vmacache.c
new file mode 100644
index 000000000000..d4224b397c0e
--- /dev/null
+++ b/mm/vmacache.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2014 Davidlohr Bueso.
+ */
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmacache.h>
+
+/*
+ * Flush the vma cache of every thread whose ->mm is the given mm.
+ *
+ * The operation is safe because the caller holds the mmap_sem
+ * exclusively and other threads accessing the vma cache will
+ * have mmap_sem held at least for read, so no extra locking
+ * is required to maintain the vma cache.
+ */
+void vmacache_flush_all(struct mm_struct *mm)
+{
+ struct task_struct *g, *p;
+
+ rcu_read_lock(); /* held across the walk over all threads */
+ for_each_process_thread(g, p) {
+ /*
+ * Only flush the vmacache pointers: the mm seqnum
+ * is already set, and each task's own seqnum is
+ * brought up to date when it next does a lookup
+ * and finds its cache invalid.
+ */
+ if (mm == p->mm)
+ vmacache_flush(p);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * Return true if current's vmacache may be used for @mm.
+ *
+ * This task may be accessing a foreign mm via (for example)
+ * get_user_pages()->find_vma(), while its task-local vmacache pertains to
+ * its own mm; there is nothing we can do in that case. The same holds for
+ * a kernel thread that has adopted this mm via use_mm(): that kernel
+ * thread's vmacache is not applicable to this mm.
+ */
+static bool vmacache_valid_mm(struct mm_struct *mm)
+{
+ return current->mm == mm && !(current->flags & PF_KTHREAD);
+}
+
+void vmacache_update(unsigned long addr, struct vm_area_struct *newvma)
+{
+ if (vmacache_valid_mm(newvma->vm_mm)) /* skip foreign mms and kthreads */
+ current->vmacache[VMACACHE_HASH(addr)] = newvma; /* slot chosen by addr */
+}
+
+static bool vmacache_valid(struct mm_struct *mm)
+{
+ struct task_struct *curr;
+
+ if (!vmacache_valid_mm(mm))
+ return false;
+
+ curr = current;
+ if (mm->vmacache_seqnum != curr->vmacache_seqnum) {
+ /*
+ * Seqnum mismatch (always the case on the first attempt):
+ * adopt the mm's seqnum and flush this task's cache.
+ */
+ curr->vmacache_seqnum = mm->vmacache_seqnum;
+ vmacache_flush(curr);
+ return false;
+ }
+ return true; /* seqnums match: the cache can be used for this mm */
+}
+
+struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr)
+{
+ int i;
+
+ if (!vmacache_valid(mm)) /* cache unusable for @mm: treat as a miss */
+ return NULL;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ struct vm_area_struct *vma = current->vmacache[i];
+
+ if (vma && vma->vm_start <= addr && vma->vm_end > addr) { /* vm_start <= addr < vm_end */
+ BUG_ON(vma->vm_mm != mm); /* a cached vma must belong to @mm */
+ return vma;
+ }
+ }
+
+ return NULL; /* no cached vma covers @addr */
+}
+
+#ifndef CONFIG_MMU
+struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ int i;
+
+ if (!vmacache_valid(mm)) /* cache unusable for @mm: treat as a miss */
+ return NULL;
+
+ for (i = 0; i < VMACACHE_SIZE; i++) {
+ struct vm_area_struct *vma = current->vmacache[i];
+
+ if (vma && vma->vm_start == start && vma->vm_end == end) /* both bounds must match */
+ return vma;
+ }
+
+ return NULL; /* no cached vma has exactly these bounds */
+}
+#endif /* !CONFIG_MMU */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0fdf96803c5b..bf233b283319 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -27,7 +27,9 @@
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
+#include <linux/compiler.h>
#include <linux/llist.h>
+
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
@@ -1083,6 +1085,12 @@ EXPORT_SYMBOL(vm_unmap_ram);
* @node: prefer to allocate data structures on this node
* @prot: memory protection to use. PAGE_KERNEL for regular RAM
*
+ * If you use this function for less than VMAP_MAX_ALLOC pages, it could be
+ * faster than vmap so it's good. But if you mix long-life and short-life
+ * objects with vm_map_ram(), it could consume lots of address space through
+ * fragmentation (especially on a 32bit machine). You could see failures in
+ * the end. Please use this function for short-lived objects.
+ *
* Returns: a pointer to the address that has been mapped, or %NULL on failure
*/
void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
@@ -2181,7 +2189,7 @@ EXPORT_SYMBOL(remap_vmalloc_range);
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
* have one.
*/
-void __attribute__((weak)) vmalloc_sync_all(void)
+void __weak vmalloc_sync_all(void)
{
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1f56a80a7c41..9b6497eda806 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1862,7 +1862,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
struct zone *zone = lruvec_zone(lruvec);
unsigned long anon_prio, file_prio;
enum scan_balance scan_balance;
- unsigned long anon, file, free;
+ unsigned long anon, file;
bool force_scan = false;
unsigned long ap, fp;
enum lru_list lru;
@@ -1916,20 +1916,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
get_lru_size(lruvec, LRU_INACTIVE_FILE);
/*
- * If it's foreseeable that reclaiming the file cache won't be
- * enough to get the zone back into a desirable shape, we have
- * to swap. Better start now and leave the - probably heavily
- * thrashing - remaining file pages alone.
- */
- if (global_reclaim(sc)) {
- free = zone_page_state(zone, NR_FREE_PAGES);
- if (unlikely(file + free <= high_wmark_pages(zone))) {
- scan_balance = SCAN_ANON;
- goto out;
- }
- }
-
- /*
* There is enough inactive page cache, do not reclaim
* anything from the anonymous working set right now.
*/
@@ -2314,15 +2300,18 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
unsigned long lru_pages = 0;
bool aborted_reclaim = false;
struct reclaim_state *reclaim_state = current->reclaim_state;
+ gfp_t orig_mask;
struct shrink_control shrink = {
.gfp_mask = sc->gfp_mask,
};
+ enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
/*
* If the number of buffer_heads in the machine exceeds the maximum
* allowed level, force direct reclaim to scan the highmem zone as
* highmem pages could be pinning lowmem pages storing buffer_heads
*/
+ orig_mask = sc->gfp_mask;
if (buffer_heads_over_limit)
sc->gfp_mask |= __GFP_HIGHMEM;
@@ -2356,7 +2345,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
* noticeable problem, like transparent huge
* page allocations.
*/
- if (compaction_ready(zone, sc)) {
+ if ((zonelist_zone_idx(z) <= requested_highidx)
+ && compaction_ready(zone, sc)) {
aborted_reclaim = true;
continue;
}
@@ -2393,6 +2383,12 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
}
}
+ /*
+ * Restore to original mask to avoid the impact on the caller if we
+ * promoted it to __GFP_HIGHMEM.
+ */
+ sc->gfp_mask = orig_mask;
+
return aborted_reclaim;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 197b4c4a9587..302dd076b8bf 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1298,14 +1298,14 @@ static int __init setup_vmstat(void)
#ifdef CONFIG_SMP
int cpu;
- register_cpu_notifier(&vmstat_notifier);
+ cpu_notifier_register_begin();
+ __register_cpu_notifier(&vmstat_notifier);
- get_online_cpus();
for_each_online_cpu(cpu) {
start_cpu_timer(cpu);
node_set_state(cpu_to_node(cpu), N_CPU);
}
- put_online_cpus();
+ cpu_notifier_register_done();
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index c03ca5e9fe15..36b4591a7a2d 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -814,21 +814,32 @@ static void zs_exit(void)
{
int cpu;
+ cpu_notifier_register_begin();
+
for_each_online_cpu(cpu)
zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
- unregister_cpu_notifier(&zs_cpu_nb);
+ __unregister_cpu_notifier(&zs_cpu_nb);
+
+ cpu_notifier_register_done();
}
static int zs_init(void)
{
int cpu, ret;
- register_cpu_notifier(&zs_cpu_nb);
+ cpu_notifier_register_begin();
+
+ __register_cpu_notifier(&zs_cpu_nb);
for_each_online_cpu(cpu) {
ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
- if (notifier_to_errno(ret))
+ if (notifier_to_errno(ret)) {
+ cpu_notifier_register_done();
goto fail;
+ }
}
+
+ cpu_notifier_register_done();
+
return 0;
fail:
zs_exit();
diff --git a/mm/zswap.c b/mm/zswap.c
index e55bab9dc41f..aeaef0fb5624 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -89,6 +89,9 @@ static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent,
zswap_max_pool_percent, uint, 0644);
+/* zbud_pool is shared by all of zswap backend */
+static struct zbud_pool *zswap_pool;
+
/*********************************
* compression functions
**********************************/
@@ -160,14 +163,14 @@ static void zswap_comp_exit(void)
* rbnode - links the entry into red-black tree for the appropriate swap type
* refcount - the number of outstanding reference to the entry. This is needed
* to protect against premature freeing of the entry by code
- * concurent calls to load, invalidate, and writeback. The lock
+ * concurrent calls to load, invalidate, and writeback. The lock
* for the zswap_tree structure that contains the entry must
* be held while changing the refcount. Since the lock must
* be held, there is no reason to also make refcount atomic.
* offset - the swap offset for the entry. Index into the red-black tree.
- * handle - zsmalloc allocation handle that stores the compressed page data
+ * handle - zbud allocation handle that stores the compressed page data
* length - the length in bytes of the compressed page data. Needed during
- * decompression
+ * decompression
*/
struct zswap_entry {
struct rb_node rbnode;
@@ -189,7 +192,6 @@ struct zswap_header {
struct zswap_tree {
struct rb_root rbroot;
spinlock_t lock;
- struct zbud_pool *pool;
};
static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
@@ -202,7 +204,7 @@ static struct kmem_cache *zswap_entry_cache;
static int zswap_entry_cache_create(void)
{
zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
- return (zswap_entry_cache == NULL);
+ return zswap_entry_cache == NULL;
}
static void zswap_entry_cache_destory(void)
@@ -282,16 +284,15 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
}
/*
- * Carries out the common pattern of freeing and entry's zsmalloc allocation,
+ * Carries out the common pattern of freeing an entry's zbud allocation,
* freeing the entry itself, and decrementing the number of stored pages.
*/
-static void zswap_free_entry(struct zswap_tree *tree,
- struct zswap_entry *entry)
+static void zswap_free_entry(struct zswap_entry *entry)
{
- zbud_free(tree->pool, entry->handle);
+ zbud_free(zswap_pool, entry->handle);
zswap_entry_cache_free(entry);
atomic_dec(&zswap_stored_pages);
- zswap_pool_pages = zbud_get_pool_size(tree->pool);
+ zswap_pool_pages = zbud_get_pool_size(zswap_pool);
}
/* caller must hold the tree lock */
@@ -311,7 +312,7 @@ static void zswap_entry_put(struct zswap_tree *tree,
BUG_ON(refcount < 0);
if (refcount == 0) {
zswap_rb_erase(&tree->rbroot, entry);
- zswap_free_entry(tree, entry);
+ zswap_free_entry(entry);
}
}
@@ -387,18 +388,18 @@ static int zswap_cpu_init(void)
{
unsigned long cpu;
- get_online_cpus();
+ cpu_notifier_register_begin();
for_each_online_cpu(cpu)
if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
goto cleanup;
- register_cpu_notifier(&zswap_cpu_notifier_block);
- put_online_cpus();
+ __register_cpu_notifier(&zswap_cpu_notifier_block);
+ cpu_notifier_register_done();
return 0;
cleanup:
for_each_online_cpu(cpu)
__zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
- put_online_cpus();
+ cpu_notifier_register_done();
return -ENOMEM;
}
@@ -407,8 +408,8 @@ cleanup:
**********************************/
static bool zswap_is_full(void)
{
- return (totalram_pages * zswap_max_pool_percent / 100 <
- zswap_pool_pages);
+ return totalram_pages * zswap_max_pool_percent / 100 <
+ zswap_pool_pages;
}
/*********************************
@@ -545,7 +546,6 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
zbud_unmap(pool, handle);
tree = zswap_trees[swp_type(swpentry)];
offset = swp_offset(swpentry);
- BUG_ON(pool != tree->pool);
/* find and ref zswap entry */
spin_lock(&tree->lock);
@@ -573,13 +573,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
case ZSWAP_SWAPCACHE_NEW: /* page is locked */
/* decompress */
dlen = PAGE_SIZE;
- src = (u8 *)zbud_map(tree->pool, entry->handle) +
+ src = (u8 *)zbud_map(zswap_pool, entry->handle) +
sizeof(struct zswap_header);
dst = kmap_atomic(page);
ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
entry->length, dst, &dlen);
kunmap_atomic(dst);
- zbud_unmap(tree->pool, entry->handle);
+ zbud_unmap(zswap_pool, entry->handle);
BUG_ON(ret);
BUG_ON(dlen != PAGE_SIZE);
@@ -652,7 +652,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
/* reclaim space if needed */
if (zswap_is_full()) {
zswap_pool_limit_hit++;
- if (zbud_reclaim_page(tree->pool, 8)) {
+ if (zbud_reclaim_page(zswap_pool, 8)) {
zswap_reject_reclaim_fail++;
ret = -ENOMEM;
goto reject;
@@ -679,7 +679,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
/* store */
len = dlen + sizeof(struct zswap_header);
- ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN,
+ ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
&handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
@@ -689,11 +689,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
zswap_reject_alloc_fail++;
goto freepage;
}
- zhdr = zbud_map(tree->pool, handle);
+ zhdr = zbud_map(zswap_pool, handle);
zhdr->swpentry = swp_entry(type, offset);
buf = (u8 *)(zhdr + 1);
memcpy(buf, dst, dlen);
- zbud_unmap(tree->pool, handle);
+ zbud_unmap(zswap_pool, handle);
put_cpu_var(zswap_dstmem);
/* populate entry */
@@ -716,7 +716,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
/* update stats */
atomic_inc(&zswap_stored_pages);
- zswap_pool_pages = zbud_get_pool_size(tree->pool);
+ zswap_pool_pages = zbud_get_pool_size(zswap_pool);
return 0;
@@ -752,13 +752,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
/* decompress */
dlen = PAGE_SIZE;
- src = (u8 *)zbud_map(tree->pool, entry->handle) +
+ src = (u8 *)zbud_map(zswap_pool, entry->handle) +
sizeof(struct zswap_header);
dst = kmap_atomic(page);
ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
dst, &dlen);
kunmap_atomic(dst);
- zbud_unmap(tree->pool, entry->handle);
+ zbud_unmap(zswap_pool, entry->handle);
BUG_ON(ret);
spin_lock(&tree->lock);
@@ -804,11 +804,9 @@ static void zswap_frontswap_invalidate_area(unsigned type)
/* walk the tree and free everything */
spin_lock(&tree->lock);
rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
- zswap_free_entry(tree, entry);
+ zswap_free_entry(entry);
tree->rbroot = RB_ROOT;
spin_unlock(&tree->lock);
-
- zbud_destroy_pool(tree->pool);
kfree(tree);
zswap_trees[type] = NULL;
}
@@ -822,20 +820,14 @@ static void zswap_frontswap_init(unsigned type)
struct zswap_tree *tree;
tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
- if (!tree)
- goto err;
- tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
- if (!tree->pool)
- goto freetree;
+ if (!tree) {
+ pr_err("alloc failed, zswap disabled for swap type %d\n", type);
+ return;
+ }
+
tree->rbroot = RB_ROOT;
spin_lock_init(&tree->lock);
zswap_trees[type] = tree;
- return;
-
-freetree:
- kfree(tree);
-err:
- pr_err("alloc failed, zswap disabled for swap type %d\n", type);
}
static struct frontswap_ops zswap_frontswap_ops = {
@@ -907,9 +899,16 @@ static int __init init_zswap(void)
return 0;
pr_info("loading zswap\n");
+
+ zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
+ if (!zswap_pool) {
+ pr_err("zbud pool creation failed\n");
+ goto error;
+ }
+
if (zswap_entry_cache_create()) {
pr_err("entry cache creation failed\n");
- goto error;
+ goto cachefail;
}
if (zswap_comp_init()) {
pr_err("compressor initialization failed\n");
@@ -919,6 +918,7 @@ static int __init init_zswap(void)
pr_err("per-cpu initialization failed\n");
goto pcpufail;
}
+
frontswap_register_ops(&zswap_frontswap_ops);
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
@@ -927,6 +927,8 @@ pcpufail:
zswap_comp_exit();
compfail:
zswap_entry_cache_destory();
+cachefail:
+ zbud_destroy_pool(zswap_pool);
error:
return -ENOMEM;
}