summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c86
-rw-r--r--mm/memcontrol.c4
-rw-r--r--mm/page-writeback.c2
-rw-r--r--mm/page_alloc.c44
-rw-r--r--mm/slab.c56
-rw-r--r--mm/slub.c36
-rw-r--r--mm/swapfile.c3
-rw-r--r--mm/truncate.c40
8 files changed, 211 insertions, 60 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index c3811bc6b9e3..79c4b2b0b14e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -813,20 +813,19 @@ EXPORT_SYMBOL(find_or_create_page);
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages)
{
- unsigned int i;
- unsigned int ret;
- unsigned int nr_found, nr_skip;
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned ret = 0;
+
+ if (unlikely(!nr_pages))
+ return 0;
rcu_read_lock();
restart:
- nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
- (void ***)pages, NULL, start, nr_pages);
- ret = 0;
- nr_skip = 0;
- for (i = 0; i < nr_found; i++) {
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
struct page *page;
repeat:
- page = radix_tree_deref_slot((void **)pages[i]);
+ page = radix_tree_deref_slot(slot);
if (unlikely(!page))
continue;
@@ -837,7 +836,7 @@ repeat:
* when entry at index 0 moves out of or back
* to root: none yet gotten, safe to restart.
*/
- WARN_ON(start | i);
+ WARN_ON(iter.index);
goto restart;
}
/*
@@ -845,7 +844,6 @@ repeat:
* here as an exceptional entry: so skip over it -
* we only reach this from invalidate_mapping_pages().
*/
- nr_skip++;
continue;
}
@@ -853,21 +851,16 @@ repeat:
goto repeat;
/* Has the page moved? */
- if (unlikely(page != *((void **)pages[i]))) {
+ if (unlikely(page != *slot)) {
page_cache_release(page);
goto repeat;
}
pages[ret] = page;
- ret++;
+ if (++ret == nr_pages)
+ break;
}
- /*
- * If all entries were removed before we could secure them,
- * try again, because callers stop trying once 0 is returned.
- */
- if (unlikely(!ret && nr_found > nr_skip))
- goto restart;
rcu_read_unlock();
return ret;
}
@@ -887,21 +880,22 @@ repeat:
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
unsigned int nr_pages, struct page **pages)
{
- unsigned int i;
- unsigned int ret;
- unsigned int nr_found;
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned int ret = 0;
+
+ if (unlikely(!nr_pages))
+ return 0;
rcu_read_lock();
restart:
- nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
- (void ***)pages, NULL, index, nr_pages);
- ret = 0;
- for (i = 0; i < nr_found; i++) {
+ radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
struct page *page;
repeat:
- page = radix_tree_deref_slot((void **)pages[i]);
+ page = radix_tree_deref_slot(slot);
+ /* The hole, there no reason to continue */
if (unlikely(!page))
- continue;
+ break;
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
@@ -924,7 +918,7 @@ repeat:
goto repeat;
/* Has the page moved? */
- if (unlikely(page != *((void **)pages[i]))) {
+ if (unlikely(page != *slot)) {
page_cache_release(page);
goto repeat;
}
@@ -934,14 +928,14 @@ repeat:
* otherwise we can get both false positives and false
* negatives, which is just confusing to the caller.
*/
- if (page->mapping == NULL || page->index != index) {
+ if (page->mapping == NULL || page->index != iter.index) {
page_cache_release(page);
break;
}
pages[ret] = page;
- ret++;
- index++;
+ if (++ret == nr_pages)
+ break;
}
rcu_read_unlock();
return ret;
@@ -962,19 +956,20 @@ EXPORT_SYMBOL(find_get_pages_contig);
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
int tag, unsigned int nr_pages, struct page **pages)
{
- unsigned int i;
- unsigned int ret;
- unsigned int nr_found;
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned ret = 0;
+
+ if (unlikely(!nr_pages))
+ return 0;
rcu_read_lock();
restart:
- nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
- (void ***)pages, *index, nr_pages, tag);
- ret = 0;
- for (i = 0; i < nr_found; i++) {
+ radix_tree_for_each_tagged(slot, &mapping->page_tree,
+ &iter, *index, tag) {
struct page *page;
repeat:
- page = radix_tree_deref_slot((void **)pages[i]);
+ page = radix_tree_deref_slot(slot);
if (unlikely(!page))
continue;
@@ -998,21 +993,16 @@ repeat:
goto repeat;
/* Has the page moved? */
- if (unlikely(page != *((void **)pages[i]))) {
+ if (unlikely(page != *slot)) {
page_cache_release(page);
goto repeat;
}
pages[ret] = page;
- ret++;
+ if (++ret == nr_pages)
+ break;
}
- /*
- * If all entries were removed before we could secure them,
- * try again, because callers stop trying once 0 is returned.
- */
- if (unlikely(!ret && nr_found))
- goto restart;
rcu_read_unlock();
if (ret)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b2ee6df0e9bb..7d698df4a067 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5306,6 +5306,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
return 0;
}
+ if (pmd_trans_unstable(pmd))
+ return 0;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
if (get_mctgt_type(vma, addr, *pte, NULL))
@@ -5502,6 +5504,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
return 0;
}
+ if (pmd_trans_unstable(pmd))
+ return 0;
retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3fc261705b1e..26adea8ca2e7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -95,6 +95,8 @@ unsigned long vm_dirty_bytes;
*/
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
+EXPORT_SYMBOL_GPL(dirty_writeback_interval);
+
/*
* The longest time for which data is allowed to remain dirty
*/
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index caea788628e4..a712fb9e04ce 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1161,11 +1161,47 @@ void drain_local_pages(void *arg)
}
/*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * Note that this code is protected against sending an IPI to an offline
+ * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
+ * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
+ * nothing keeps CPUs from showing up after we populated the cpumask and
+ * before the call to on_each_cpu_mask().
*/
void drain_all_pages(void)
{
- on_each_cpu(drain_local_pages, NULL, 1);
+ int cpu;
+ struct per_cpu_pageset *pcp;
+ struct zone *zone;
+
+ /*
+ * Allocate in the BSS so we wont require allocation in
+ * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
+ */
+ static cpumask_t cpus_with_pcps;
+
+ /*
+ * We don't care about racing with CPU hotplug event
+ * as offline notification will cause the notified
+ * cpu to drain that CPU pcps and on_each_cpu_mask
+ * disables preemption as part of its processing
+ */
+ for_each_online_cpu(cpu) {
+ bool has_pcps = false;
+ for_each_populated_zone(zone) {
+ pcp = per_cpu_ptr(zone->pageset, cpu);
+ if (pcp->pcp.count) {
+ has_pcps = true;
+ break;
+ }
+ }
+ if (has_pcps)
+ cpumask_set_cpu(cpu, &cpus_with_pcps);
+ else
+ cpumask_clear_cpu(cpu, &cpus_with_pcps);
+ }
+ on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
}
#ifdef CONFIG_HIBERNATION
@@ -2308,6 +2344,10 @@ rebalance:
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
if (oom_killer_disabled)
goto nopage;
+ /* Coredumps can quickly deplete all memory reserves */
+ if ((current->flags & PF_DUMPCORE) &&
+ !(gfp_mask & __GFP_NOFAIL))
+ goto nopage;
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
nodemask, preferred_zone,
diff --git a/mm/slab.c b/mm/slab.c
index 29c8716eb7a9..e901a36e2520 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1731,6 +1731,52 @@ static int __init cpucache_init(void)
}
__initcall(cpucache_init);
+static noinline void
+slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
+{
+ struct kmem_list3 *l3;
+ struct slab *slabp;
+ unsigned long flags;
+ int node;
+
+ printk(KERN_WARNING
+ "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
+ nodeid, gfpflags);
+ printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
+ cachep->name, cachep->buffer_size, cachep->gfporder);
+
+ for_each_online_node(node) {
+ unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
+ unsigned long active_slabs = 0, num_slabs = 0;
+
+ l3 = cachep->nodelists[node];
+ if (!l3)
+ continue;
+
+ spin_lock_irqsave(&l3->list_lock, flags);
+ list_for_each_entry(slabp, &l3->slabs_full, list) {
+ active_objs += cachep->num;
+ active_slabs++;
+ }
+ list_for_each_entry(slabp, &l3->slabs_partial, list) {
+ active_objs += slabp->inuse;
+ active_slabs++;
+ }
+ list_for_each_entry(slabp, &l3->slabs_free, list)
+ num_slabs++;
+
+ free_objects += l3->free_objects;
+ spin_unlock_irqrestore(&l3->list_lock, flags);
+
+ num_slabs += active_slabs;
+ num_objs = num_slabs * cachep->num;
+ printk(KERN_WARNING
+ " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
+ node, active_slabs, num_slabs, active_objs, num_objs,
+ free_objects);
+ }
+}
+
/*
* Interface to system's page allocator. No need to hold the cache-lock.
*
@@ -1757,8 +1803,11 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
flags |= __GFP_RECLAIMABLE;
page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
- if (!page)
+ if (!page) {
+ if (!(flags & __GFP_NOWARN) && printk_ratelimit())
+ slab_out_of_memory(cachep, flags, nodeid);
return NULL;
+ }
nr_pages = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
@@ -3696,13 +3745,12 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
if (likely(ac->avail < ac->limit)) {
STATS_INC_FREEHIT(cachep);
- ac->entry[ac->avail++] = objp;
- return;
} else {
STATS_INC_FREEMISS(cachep);
cache_flusharray(cachep, ac);
- ac->entry[ac->avail++] = objp;
}
+
+ ac->entry[ac->avail++] = objp;
}
/**
diff --git a/mm/slub.c b/mm/slub.c
index f4a6229848fd..ffe13fdf8144 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -29,6 +29,7 @@
#include <linux/math64.h>
#include <linux/fault-inject.h>
#include <linux/stacktrace.h>
+#include <linux/prefetch.h>
#include <trace/events/kmem.h>
@@ -269,6 +270,11 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
return *(void **)(object + s->offset);
}
+static void prefetch_freepointer(const struct kmem_cache *s, void *object)
+{
+ prefetch(object + s->offset);
+}
+
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
void *p;
@@ -1560,6 +1566,7 @@ static void *get_partial_node(struct kmem_cache *s,
} else {
page->freelist = t;
available = put_cpu_partial(s, page, 0);
+ stat(s, CPU_PARTIAL_NODE);
}
if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
break;
@@ -1983,6 +1990,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
local_irq_restore(flags);
pobjects = 0;
pages = 0;
+ stat(s, CPU_PARTIAL_DRAIN);
}
}
@@ -1994,7 +2002,6 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
page->next = oldpage;
} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
- stat(s, CPU_PARTIAL_FREE);
return pobjects;
}
@@ -2028,9 +2035,17 @@ static void flush_cpu_slab(void *d)
__flush_cpu_slab(s, smp_processor_id());
}
+static bool has_cpu_slab(int cpu, void *info)
+{
+ struct kmem_cache *s = info;
+ struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+
+ return !!(c->page);
+}
+
static void flush_all(struct kmem_cache *s)
{
- on_each_cpu(flush_cpu_slab, s, 1);
+ on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
}
/*
@@ -2319,6 +2334,8 @@ redo:
object = __slab_alloc(s, gfpflags, node, addr, c);
else {
+ void *next_object = get_freepointer_safe(s, object);
+
/*
* The cmpxchg will only match if there was no additional
* operation and if we are on the right processor.
@@ -2334,11 +2351,12 @@ redo:
if (unlikely(!this_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
object, tid,
- get_freepointer_safe(s, object), next_tid(tid)))) {
+ next_object, next_tid(tid)))) {
note_cmpxchg_failure("slab_alloc", s, tid);
goto redo;
}
+ prefetch_freepointer(s, next_object);
stat(s, ALLOC_FASTPATH);
}
@@ -2475,9 +2493,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
* If we just froze the page then put it onto the
* per cpu partial list.
*/
- if (new.frozen && !was_frozen)
+ if (new.frozen && !was_frozen) {
put_cpu_partial(s, page, 1);
-
+ stat(s, CPU_PARTIAL_FREE);
+ }
/*
* The list lock was not taken therefore no list
* activity can be necessary.
@@ -3939,13 +3958,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
if (kmem_cache_open(s, n,
size, align, flags, ctor)) {
list_add(&s->list, &slab_caches);
+ up_write(&slub_lock);
if (sysfs_slab_add(s)) {
+ down_write(&slub_lock);
list_del(&s->list);
kfree(n);
kfree(s);
goto err;
}
- up_write(&slub_lock);
return s;
}
kfree(n);
@@ -5069,6 +5089,8 @@ STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
+STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
+STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
#endif
static struct attribute *slab_attrs[] = {
@@ -5134,6 +5156,8 @@ static struct attribute *slab_attrs[] = {
&cmpxchg_double_cpu_fail_attr.attr,
&cpu_partial_alloc_attr.attr,
&cpu_partial_free_attr.attr,
+ &cpu_partial_node_attr.attr,
+ &cpu_partial_drain_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
&failslab_attr.attr,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dae42f380d6e..fafc26d1b1dc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2022,6 +2022,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
struct page *page = NULL;
struct inode *inode = NULL;
+ if (swap_flags & ~SWAP_FLAGS_VALID)
+ return -EINVAL;
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
diff --git a/mm/truncate.c b/mm/truncate.c
index 18aded3a89fc..61a183b89df6 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -626,3 +626,43 @@ int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
return 0;
}
+
+/**
+ * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
+ * @inode: inode
+ * @lstart: offset of beginning of hole
+ * @lend: offset of last byte of hole
+ *
+ * This function should typically be called before the filesystem
+ * releases resources associated with the freed range (eg. deallocates
+ * blocks). This way, pagecache will always stay logically coherent
+ * with on-disk format, and the filesystem would not have to deal with
+ * situations such as writepage being called for a page that has already
+ * had its underlying blocks deallocated.
+ */
+void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
+{
+ struct address_space *mapping = inode->i_mapping;
+ loff_t unmap_start = round_up(lstart, PAGE_SIZE);
+ loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1;
+ /*
+ * This rounding is currently just for example: unmap_mapping_range
+ * expands its hole outwards, whereas we want it to contract the hole
+ * inwards. However, existing callers of truncate_pagecache_range are
+ * doing their own page rounding first; and truncate_inode_pages_range
+ * currently BUGs if lend is not pagealigned-1 (it handles partial
+ * page at start of hole, but not partial page at end of hole). Note
+ * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
+ */
+
+ /*
+ * Unlike in truncate_pagecache, unmap_mapping_range is called only
+ * once (before truncating pagecache), and without "even_cows" flag:
+ * hole-punching should not remove private COWed pages from the hole.
+ */
+ if ((u64)unmap_end > (u64)unmap_start)
+ unmap_mapping_range(mapping, unmap_start,
+ 1 + unmap_end - unmap_start, 0);
+ truncate_inode_pages_range(mapping, lstart, lend);
+}
+EXPORT_SYMBOL(truncate_pagecache_range);