From 2e08d20d777e997bf37806b22b471f98fbe6b693 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennisszhou@gmail.com>
Date: Wed, 27 Sep 2017 16:34:59 -0500
Subject: percpu: fix starting offset for chunk statistics traversal

This patch fixes the starting offset used when scanning chunks to
compute the chunk statistics. The value start_offset (and end_offset)
are managed in bytes while the traversal occurs over bits. Thus for the
reserved and dynamic chunk, it may incorrectly skip over the initial
allocations.

Signed-off-by: Dennis Zhou <dennisszhou@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 mm/percpu-stats.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index 6142484e88f7..7a58460bfd27 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -73,7 +73,7 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
 		     last_alloc + 1 : 0;
 
 	as_len = 0;
-	start = chunk->start_offset;
+	start = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
 
 	/*
 	 * If a bit is set in the allocation map, the bound_map identifies
-- 
cgit v1.2.3


From 1fa4df3e688902d033dfda796eb83ae6ad8d0488 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennisszhou@gmail.com>
Date: Wed, 27 Sep 2017 16:35:00 -0500
Subject: percpu: fix iteration to prevent skipping over block

The iterator functions pcpu_next_md_free_region and
pcpu_next_fit_region use the block offset to determine if they have
checked the area in the prior iteration. However, this causes an issue
when the block offset is greater than subsequent block contig hints. If
within the iterator it moves to check subsequent blocks, it may fail in
the second predicate due to the block offset not being cleared. Thus,
this causes the allocator to skip over blocks leading to false failures
when allocating from the reserved chunk. While this happens in the
general case as well, it will only fail if it cannot allocate a new
chunk.

This patch resets the block offset to 0 to pass the second predicate
when checking subseqent blocks within the iterator function.

Signed-off-by: Dennis Zhou <dennisszhou@gmail.com>
Reported-and-tested-by: Luis Henriques <lhenriques@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 mm/percpu.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'mm')

diff --git a/mm/percpu.c b/mm/percpu.c
index 59d44d61f5f1..aa121cef76de 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -353,6 +353,8 @@ static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
 					block->contig_hint_start);
 			return;
 		}
+		/* reset to satisfy the second predicate above */
+		block_off = 0;
 
 		*bits = block->right_free;
 		*bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
@@ -407,6 +409,8 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
 			*bit_off = pcpu_block_off_to_off(i, block->first_free);
 			return;
 		}
+		/* reset to satisfy the second predicate above */
+		block_off = 0;
 
 		*bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
 				 align);
-- 
cgit v1.2.3


From 4b22927f0cbd58303aac689e378d20bf56267a39 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Tue, 3 Oct 2017 16:14:27 -0700
Subject: ksm: fix unlocked iteration over vmas in cmp_and_merge_page()

In this place mm is unlocked, so vmas or list may change.  Down read
mmap_sem to protect them from modifications.

Link: http://lkml.kernel.org/r/150512788393.10691.8868381099691121308.stgit@localhost.localdomain
Fixes: e86c59b1b12d ("mm/ksm: improve deduplication of zero pages with colouring")
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: zhong jiang <zhongjiang@huawei.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Claudio Imbrenda <imbrenda@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/ksm.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/ksm.c b/mm/ksm.c
index 15dd7415f7b3..6cb60f46cce5 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1990,6 +1990,7 @@ static void stable_tree_append(struct rmap_item *rmap_item,
  */
 static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 {
+	struct mm_struct *mm = rmap_item->mm;
 	struct rmap_item *tree_rmap_item;
 	struct page *tree_page = NULL;
 	struct stable_node *stable_node;
@@ -2062,9 +2063,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
 		struct vm_area_struct *vma;
 
-		vma = find_mergeable_vma(rmap_item->mm, rmap_item->address);
+		down_read(&mm->mmap_sem);
+		vma = find_mergeable_vma(mm, rmap_item->address);
 		err = try_to_merge_one_page(vma, page,
 					    ZERO_PAGE(rmap_item->address));
+		up_read(&mm->mmap_sem);
 		/*
 		 * In case of failure, the page was not really empty, so we
 		 * need to continue. Otherwise we're done.
-- 
cgit v1.2.3


From 19bfbe22f59a207417b2679e7e83c180419c9ec5 Mon Sep 17 00:00:00 2001
From: Alexandru Moise <00moses.alexander00@gmail.com>
Date: Tue, 3 Oct 2017 16:14:31 -0700
Subject: mm, hugetlb, soft_offline: save compound page order before page
 migration

This fixes a bug in madvise() where if you'd try to soft offline a
hugepage via madvise(), while walking the address range you'd end up,
using the wrong page offset due to attempting to get the compound order
of a former but presently not compound page, due to dissolving the huge
page (since commit c3114a84f7f9: "mm: hugetlb: soft-offline: dissolve
source hugepage after successful migration").

As a result I ended up with all my free pages except one being offlined.

Link: http://lkml.kernel.org/r/20170912204306.GA12053@gmail.com
Fixes: c3114a84f7f9 ("mm: hugetlb: soft-offline: dissolve source hugepage after successful migration")
Signed-off-by: Alexandru Moise <00moses.alexander00@gmail.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Shaohua Li <shli@fb.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: David Rientjes <rientjes@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/madvise.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/madvise.c b/mm/madvise.c
index 21261ff0466f..25bade36e9ca 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -625,18 +625,26 @@ static int madvise_inject_error(int behavior,
 {
 	struct page *page;
 	struct zone *zone;
+	unsigned int order;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	for (; start < end; start += PAGE_SIZE <<
-				compound_order(compound_head(page))) {
+
+	for (; start < end; start += PAGE_SIZE << order) {
 		int ret;
 
 		ret = get_user_pages_fast(start, 1, 0, &page);
 		if (ret != 1)
 			return ret;
 
+		/*
+		 * When soft offlining hugepages, after migrating the page
+		 * we dissolve it, therefore in the second loop "page" will
+		 * no longer be a compound page, and order will be 0.
+		 */
+		order = compound_order(compound_head(page));
+
 		if (PageHWPoison(page)) {
 			put_page(page);
 			continue;
-- 
cgit v1.2.3


From d5567c9df1ef001b2a7e6684b3b3498371ee4cae Mon Sep 17 00:00:00 2001
From: Vitaly Wool <vitalywool@gmail.com>
Date: Tue, 3 Oct 2017 16:14:47 -0700
Subject: z3fold: fix potential race in z3fold_reclaim_page

It is possible that on a (partially) unsuccessful page reclaim,
kref_put() called in z3fold_reclaim_page() does not yield page release,
but the page is released shortly afterwards by another thread.  Then
z3fold_reclaim_page() would try to list_add() that (released) page again
which is obviously a bug.

To avoid that, spin_lock() has to be taken earlier, before the
kref_put() call mentioned earlier.

Link: http://lkml.kernel.org/r/20170913162937.bfff21c7d12b12a5f47639fd@gmail.com
Signed-off-by: Vitaly Wool <vitalywool@gmail.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: <Oleksiy.Avramchenko@sony.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/z3fold.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/z3fold.c b/mm/z3fold.c
index 486550df32be..b04fa3ba1bf2 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -875,16 +875,18 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
 				goto next;
 		}
 next:
+		spin_lock(&pool->lock);
 		if (test_bit(PAGE_HEADLESS, &page->private)) {
 			if (ret == 0) {
+				spin_unlock(&pool->lock);
 				free_z3fold_page(page);
 				return 0;
 			}
 		} else if (kref_put(&zhdr->refcount, release_z3fold_page)) {
 			atomic64_dec(&pool->pages_nr);
+			spin_unlock(&pool->lock);
 			return 0;
 		}
-		spin_lock(&pool->lock);
 
 		/*
 		 * Add to the beginning of LRU.
-- 
cgit v1.2.3


From 4d4bbd8526a8fbeb2c090ea360211fceff952383 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 3 Oct 2017 16:14:50 -0700
Subject: mm, oom_reaper: skip mm structs with mmu notifiers

Andrea has noticed that the oom_reaper doesn't invalidate the range via
mmu notifiers (mmu_notifier_invalidate_range_start/end) and that can
corrupt the memory of the kvm guest for example.

tlb_flush_mmu_tlbonly already invokes mmu notifiers but that is not
sufficient as per Andrea:

 "mmu_notifier_invalidate_range cannot be used in replacement of
  mmu_notifier_invalidate_range_start/end. For KVM
  mmu_notifier_invalidate_range is a noop and rightfully so. A MMU
  notifier implementation has to implement either ->invalidate_range
  method or the invalidate_range_start/end methods, not both. And if you
  implement invalidate_range_start/end like KVM is forced to do, calling
  mmu_notifier_invalidate_range in common code is a noop for KVM.

  For those MMU notifiers that can get away only implementing
  ->invalidate_range, the ->invalidate_range is implicitly called by
  mmu_notifier_invalidate_range_end(). And only those secondary MMUs
  that share the same pagetable with the primary MMU (like AMD iommuv2)
  can get away only implementing ->invalidate_range"

As the callback is allowed to sleep and the implementation is out of
hand of the MM it is safer to simply bail out if there is an mmu
notifier registered.  In order to not fail too early make the
mm_has_notifiers check under the oom_lock and have a little nap before
failing to give the current oom victim some more time to exit.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20170913113427.2291-1-mhocko@kernel.org
Fixes: aac453635549 ("mm, oom: introduce oom reaper")
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/oom_kill.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 99736e026712..dee0f75c3013 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -40,6 +40,7 @@
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
 #include <linux/init.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlb.h>
 #include "internal.h"
@@ -494,6 +495,21 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
 		goto unlock_oom;
 	}
 
+	/*
+	 * If the mm has notifiers then we would need to invalidate them around
+	 * unmap_page_range and that is risky because notifiers can sleep and
+	 * what they do is basically undeterministic.  So let's have a short
+	 * sleep to give the oom victim some more time.
+	 * TODO: we really want to get rid of this ugly hack and make sure that
+	 * notifiers cannot block for unbounded amount of time and add
+	 * mmu_notifier_invalidate_range_{start,end} around unmap_page_range
+	 */
+	if (mm_has_notifiers(mm)) {
+		up_read(&mm->mmap_sem);
+		schedule_timeout_idle(HZ);
+		goto unlock_oom;
+	}
+
 	/*
 	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
 	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
-- 
cgit v1.2.3


From 72f0184c8a00c70179cfed6266e2e06b4d400065 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 3 Oct 2017 16:14:53 -0700
Subject: mm, memcg: remove hotplug locking from try_charge

The following lockdep splat has been noticed during LTP testing

  ======================================================
  WARNING: possible circular locking dependency detected
  4.13.0-rc3-next-20170807 #12 Not tainted
  ------------------------------------------------------
  a.out/4771 is trying to acquire lock:
   (cpu_hotplug_lock.rw_sem){++++++}, at: [<ffffffff812b4668>] drain_all_stock.part.35+0x18/0x140

  but task is already holding lock:
   (&mm->mmap_sem){++++++}, at: [<ffffffff8106eb35>] __do_page_fault+0x175/0x530

  which lock already depends on the new lock.

  the existing dependency chain (in reverse order) is:

  -> #3 (&mm->mmap_sem){++++++}:
         lock_acquire+0xc9/0x230
         __might_fault+0x70/0xa0
         _copy_to_user+0x23/0x70
         filldir+0xa7/0x110
         xfs_dir2_sf_getdents.isra.10+0x20c/0x2c0 [xfs]
         xfs_readdir+0x1fa/0x2c0 [xfs]
         xfs_file_readdir+0x30/0x40 [xfs]
         iterate_dir+0x17a/0x1a0
         SyS_getdents+0xb0/0x160
         entry_SYSCALL_64_fastpath+0x1f/0xbe

  -> #2 (&type->i_mutex_dir_key#3){++++++}:
         lock_acquire+0xc9/0x230
         down_read+0x51/0xb0
         lookup_slow+0xde/0x210
         walk_component+0x160/0x250
         link_path_walk+0x1a6/0x610
         path_openat+0xe4/0xd50
         do_filp_open+0x91/0x100
         file_open_name+0xf5/0x130
         filp_open+0x33/0x50
         kernel_read_file_from_path+0x39/0x80
         _request_firmware+0x39f/0x880
         request_firmware_direct+0x37/0x50
         request_microcode_fw+0x64/0xe0
         reload_store+0xf7/0x180
         dev_attr_store+0x18/0x30
         sysfs_kf_write+0x44/0x60
         kernfs_fop_write+0x113/0x1a0
         __vfs_write+0x37/0x170
         vfs_write+0xc7/0x1c0
         SyS_write+0x58/0xc0
         do_syscall_64+0x6c/0x1f0
         return_from_SYSCALL_64+0x0/0x7a

  -> #1 (microcode_mutex){+.+.+.}:
         lock_acquire+0xc9/0x230
         __mutex_lock+0x88/0x960
         mutex_lock_nested+0x1b/0x20
         microcode_init+0xbb/0x208
         do_one_initcall+0x51/0x1a9
         kernel_init_freeable+0x208/0x2a7
         kernel_init+0xe/0x104
         ret_from_fork+0x2a/0x40

  -> #0 (cpu_hotplug_lock.rw_sem){++++++}:
         __lock_acquire+0x153c/0x1550
         lock_acquire+0xc9/0x230
         cpus_read_lock+0x4b/0x90
         drain_all_stock.part.35+0x18/0x140
         try_charge+0x3ab/0x6e0
         mem_cgroup_try_charge+0x7f/0x2c0
         shmem_getpage_gfp+0x25f/0x1050
         shmem_fault+0x96/0x200
         __do_fault+0x1e/0xa0
         __handle_mm_fault+0x9c3/0xe00
         handle_mm_fault+0x16e/0x380
         __do_page_fault+0x24a/0x530
         do_page_fault+0x30/0x80
         page_fault+0x28/0x30

  other info that might help us debug this:

  Chain exists of:
    cpu_hotplug_lock.rw_sem --> &type->i_mutex_dir_key#3 --> &mm->mmap_sem

   Possible unsafe locking scenario:

         CPU0                    CPU1
         ----                    ----
    lock(&mm->mmap_sem);
                                 lock(&type->i_mutex_dir_key#3);
                                 lock(&mm->mmap_sem);
    lock(cpu_hotplug_lock.rw_sem);

   *** DEADLOCK ***

  2 locks held by a.out/4771:
   #0:  (&mm->mmap_sem){++++++}, at: [<ffffffff8106eb35>] __do_page_fault+0x175/0x530
   #1:  (percpu_charge_mutex){+.+...}, at: [<ffffffff812b4c97>] try_charge+0x397/0x6e0

The problem is very similar to the one fixed by commit a459eeb7b852
("mm, page_alloc: do not depend on cpu hotplug locks inside the
allocator").  We are taking hotplug locks while we can be sitting on top
of basically arbitrary locks.  This just calls for problems.

We can get rid of {get,put}_online_cpus, fortunately.  We do not have to
be worried about races with memory hotplug because drain_local_stock,
which is called from both the WQ draining and the memory hotplug
contexts, is always operating on the local cpu stock with IRQs disabled.

The only thing to be careful about is that the target memcg doesn't
vanish while we are still in drain_all_stock so take a reference on it.

Link: http://lkml.kernel.org/r/20170913090023.28322-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Artem Savkov <asavkov@redhat.com>
Tested-by: Artem Savkov <asavkov@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 15af3da5af02..696c6529e900 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1777,6 +1777,10 @@ static void drain_local_stock(struct work_struct *dummy)
 	struct memcg_stock_pcp *stock;
 	unsigned long flags;
 
+	/*
+	 * The only protection from memory hotplug vs. drain_stock races is
+	 * that we always operate on local CPU stock here with IRQ disabled
+	 */
 	local_irq_save(flags);
 
 	stock = this_cpu_ptr(&memcg_stock);
@@ -1821,27 +1825,33 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
 	/* If someone's already draining, avoid adding running more workers. */
 	if (!mutex_trylock(&percpu_charge_mutex))
 		return;
-	/* Notify other cpus that system-wide "drain" is running */
-	get_online_cpus();
+	/*
+	 * Notify other cpus that system-wide "drain" is running
+	 * We do not care about races with the cpu hotplug because cpu down
+	 * as well as workers from this path always operate on the local
+	 * per-cpu data. CPU up doesn't touch memcg_stock at all.
+	 */
 	curcpu = get_cpu();
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
 		struct mem_cgroup *memcg;
 
 		memcg = stock->cached;
-		if (!memcg || !stock->nr_pages)
+		if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
 			continue;
-		if (!mem_cgroup_is_descendant(memcg, root_memcg))
+		if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
+			css_put(&memcg->css);
 			continue;
+		}
 		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
 			if (cpu == curcpu)
 				drain_local_stock(&stock->work);
 			else
 				schedule_work_on(cpu, &stock->work);
 		}
+		css_put(&memcg->css);
 	}
 	put_cpu();
-	put_online_cpus();
 	mutex_unlock(&percpu_charge_mutex);
 }
 
-- 
cgit v1.2.3


From 3f2eb0287ebd62ec8d6d544f830285302279e6bf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= <jglisse@redhat.com>
Date: Tue, 3 Oct 2017 16:14:57 -0700
Subject: mm/memcg: avoid page count check for zone device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix for 4.14, zone device page always have an elevated refcount of one
and thus page count sanity check in uncharge_page() is inappropriate for
them.

[mhocko@suse.com: nano-optimize VM_BUG_ON in uncharge_page]
Link: http://lkml.kernel.org/r/20170914190011.5217-1-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Evgeny Baskakov <ebaskakov@nvidia.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 696c6529e900..d5f3a62887cf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5658,7 +5658,8 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
 {
 	VM_BUG_ON_PAGE(PageLRU(page), page);
-	VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+	VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
+			!PageHWPoison(page) , page);
 
 	if (!page->mem_cgroup)
 		return;
-- 
cgit v1.2.3


From 6818600ff094ca255a7fe31838ad50c29656c3c5 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 3 Oct 2017 16:15:03 -0700
Subject: mm,compaction: serialize waitqueue_active() checks (for real)

Andrea brought to my attention that the L->{L,S} guarantees are
completely bogus for this case.  I was looking at the diagram, from the
offending commit, when that _is_ the race, we had the load reordered
already.

What we need is at least S->L semantics, thus simply use
wq_has_sleeper() to serialize the call for good.

Link: http://lkml.kernel.org/r/20170914175313.GB811@linux-80c1.suse
Fixes: 46acef048a6 (mm,compaction: serialize waitqueue_active() checks)
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Reported-by: Andrea Parri <parri.andrea@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index fb548e4c7bd4..03d31a875341 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1999,17 +1999,14 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
 	if (pgdat->kcompactd_max_order < order)
 		pgdat->kcompactd_max_order = order;
 
-	/*
-	 * Pairs with implicit barrier in wait_event_freezable()
-	 * such that wakeups are not missed in the lockless
-	 * waitqueue_active() call.
-	 */
-	smp_acquire__after_ctrl_dep();
-
 	if (pgdat->kcompactd_classzone_idx > classzone_idx)
 		pgdat->kcompactd_classzone_idx = classzone_idx;
 
-	if (!waitqueue_active(&pgdat->kcompactd_wait))
+	/*
+	 * Pairs with implicit barrier in wait_event_freezable()
+	 * such that wakeups are not missed.
+	 */
+	if (!wq_has_sleeper(&pgdat->kcompactd_wait))
 		return;
 
 	if (!kcompactd_node_suitable(pgdat))
-- 
cgit v1.2.3


From 3552935742e0d5f0dafd823736f45bdaa7ba672c Mon Sep 17 00:00:00 2001
From: Vitaly Wool <vitalywool@gmail.com>
Date: Tue, 3 Oct 2017 16:15:06 -0700
Subject: z3fold: fix stale list handling

Fix the situation when clear_bit() is called for page->private before
the page pointer is actually assigned.  While at it, remove work_busy()
check because it is costly and does not give 100% guarantee anyway.

Signed-off-by: Vitaly Wool <vitalywool@gmail.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: <Oleksiy.Avramchenko@sony.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/z3fold.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/z3fold.c b/mm/z3fold.c
index b04fa3ba1bf2..b2ba2ba585f3 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -250,6 +250,7 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
 
 	WARN_ON(!list_empty(&zhdr->buddy));
 	set_bit(PAGE_STALE, &page->private);
+	clear_bit(NEEDS_COMPACTING, &page->private);
 	spin_lock(&pool->lock);
 	if (!list_empty(&page->lru))
 		list_del(&page->lru);
@@ -303,7 +304,6 @@ static void free_pages_work(struct work_struct *w)
 		list_del(&zhdr->buddy);
 		if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
 			continue;
-		clear_bit(NEEDS_COMPACTING, &page->private);
 		spin_unlock(&pool->stale_lock);
 		cancel_work_sync(&zhdr->work);
 		free_z3fold_page(page);
@@ -624,10 +624,8 @@ lookup:
 	 * stale pages list. cancel_work_sync() can sleep so we must make
 	 * sure it won't be called in case we're in atomic context.
 	 */
-	if (zhdr && (can_sleep || !work_pending(&zhdr->work) ||
-	    !unlikely(work_busy(&zhdr->work)))) {
+	if (zhdr && (can_sleep || !work_pending(&zhdr->work))) {
 		list_del(&zhdr->buddy);
-		clear_bit(NEEDS_COMPACTING, &page->private);
 		spin_unlock(&pool->stale_lock);
 		if (can_sleep)
 			cancel_work_sync(&zhdr->work);
-- 
cgit v1.2.3


From 57148a64e823bb1f49112fa52a92a7f372cda892 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 3 Oct 2017 16:15:10 -0700
Subject: mm: meminit: mark init_reserved_page as __meminit

The function is called from __meminit context and calls other __meminit
functions but isn't it self mark as such today:

  WARNING: vmlinux.o(.text.unlikely+0x4516): Section mismatch in reference from the function init_reserved_page() to the function .meminit.text:early_pfn_to_nid()
  The function init_reserved_page() references the function __meminit early_pfn_to_nid().
  This is often because init_reserved_page lacks a __meminit annotation or the annotation of early_pfn_to_nid is wrong.

On most compilers, we don't notice this because the function gets
inlined all the time.  Adding __meminit here fixes the harmless warning
for the old versions and is generally the correct annotation.

Link: http://lkml.kernel.org/r/20170915193149.901180-1-arnd@arndb.de
Fixes: 7e18adb4f80b ("mm: meminit: initialise remaining struct pages in parallel with kswapd")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c841af88836a..38d165a87860 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1190,7 +1190,7 @@ static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
 }
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void init_reserved_page(unsigned long pfn)
+static void __meminit init_reserved_page(unsigned long pfn)
 {
 	pg_data_t *pgdat;
 	int nid, zid;
-- 
cgit v1.2.3


From a872eb2131e91ce7c89a8888974a5e22a272b12f Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Tue, 3 Oct 2017 16:15:16 -0700
Subject: mm: fix RODATA_TEST failure "rodata_test: test data was not read
 only"

On powerpc, RODATA_TEST fails with message the following messages:

  Freeing unused kernel memory: 528K
  rodata_test: test data was not read only

This is because GCC allocates it to .data section:

  c0695034 g     O .data	00000004 rodata_test_data

Since commit 056b9d8a7692 ("mm: remove rodata_test_data export, add
pr_fmt"), rodata_test_data is used only inside rodata_test.c By
declaring it static, it gets properly allocated into .rodata section
instead of .data:

  c04df710 l     O .rodata	00000004 rodata_test_data

Fixes: 056b9d8a7692 ("mm: remove rodata_test_data export, add pr_fmt")
Link: http://lkml.kernel.org/r/20170921093729.1080368AC1@po15668-vm-win7.idsi0.si.c-s.fr
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Kees Cook <keescook@chromium.org>
Cc: Jinbum Park <jinb.park7@gmail.com>
Cc: Segher Boessenkool <segher@kernel.crashing.org>
Cc: David Laight <David.Laight@ACULAB.COM>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rodata_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/rodata_test.c b/mm/rodata_test.c
index 6bb4deb12e78..d908c8769b48 100644
--- a/mm/rodata_test.c
+++ b/mm/rodata_test.c
@@ -14,7 +14,7 @@
 #include <linux/uaccess.h>
 #include <asm/sections.h>
 
-const int rodata_test_data = 0xC3;
+static const int rodata_test_data = 0xC3;
 
 void rodata_test(void)
 {
-- 
cgit v1.2.3


From f4e222c56c83b2aed7cc2b329fca7435508eefa1 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 3 Oct 2017 16:15:25 -0700
Subject: mm: have filemap_check_and_advance_wb_err clear AS_EIO/AS_ENOSPC

Eryu noticed that he could sometimes get a leftover error reported when
it shouldn't be on fsync with ext2 and non-journalled ext4.

The problem is that writeback_single_inode still uses filemap_fdatawait.
That picks up a previously set AS_EIO flag, which would ordinarily have
been cleared before.

Since we're mostly using this function as a replacement for
filemap_check_errors, have filemap_check_and_advance_wb_err clear AS_EIO
and AS_ENOSPC when reporting an error.  That should allow the new
function to better emulate the behavior of the old with respect to these
flags.

Link: http://lkml.kernel.org/r/20170922133331.28812-1-jlayton@kernel.org
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reported-by: Eryu Guan <eguan@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index db250d0e0565..594d73fef8b4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -620,6 +620,14 @@ int file_check_and_advance_wb_err(struct file *file)
 		trace_file_check_and_advance_wb_err(file, old);
 		spin_unlock(&file->f_lock);
 	}
+
+	/*
+	 * We're mostly using this function as a drop in replacement for
+	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
+	 * that the legacy code would have had on these flags.
+	 */
+	clear_bit(AS_EIO, &mapping->flags);
+	clear_bit(AS_ENOSPC, &mapping->flags);
 	return err;
 }
 EXPORT_SYMBOL(file_check_and_advance_wb_err);
-- 
cgit v1.2.3


From 24c92eb7dce0a299b8e1a8c5fa585844a53bf7f0 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Tue, 3 Oct 2017 16:15:29 -0700
Subject: mm: avoid marking swap cached page as lazyfree

MADV_FREE clears pte dirty bit and then marks the page lazyfree (clear
SwapBacked).  There is no lock to prevent the page is added to swap
cache between these two steps by page reclaim.  Page reclaim could add
the page to swap cache and unmap the page.  After page reclaim, the page
is added back to lru.  At that time, we probably start draining per-cpu
pagevec and mark the page lazyfree.  So the page could be in a state
with SwapBacked cleared and PG_swapcache set.  Next time there is a
refault in the virtual address, do_swap_page can find the page from swap
cache but the page has PageSwapCache false because SwapBacked isn't set,
so do_swap_page will bail out and do nothing.  The task will keep
running into fault handler.

Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages")
Link: http://lkml.kernel.org/r/6537ef3814398c0073630b03f176263bc81f0902.1506446061.git.shli@fb.com
Signed-off-by: Shaohua Li <shli@fb.com>
Reported-by: Artem Savkov <asavkov@redhat.com>
Tested-by: Artem Savkov <asavkov@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: <stable@vger.kernel.org>	[4.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/swap.c b/mm/swap.c
index 9295ae960d66..a77d68f2c1b6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -575,7 +575,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
 			    void *arg)
 {
 	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
-	    !PageUnevictable(page)) {
+	    !PageSwapCache(page) && !PageUnevictable(page)) {
 		bool active = PageActive(page);
 
 		del_page_from_lru_list(page, lruvec,
@@ -665,7 +665,7 @@ void deactivate_file_page(struct page *page)
 void mark_page_lazyfree(struct page *page)
 {
 	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
-	    !PageUnevictable(page)) {
+	    !PageSwapCache(page) && !PageUnevictable(page)) {
 		struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
 
 		get_page(page);
-- 
cgit v1.2.3


From 9625456cc76391b7f3f2809579126542a8ed4d39 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Tue, 3 Oct 2017 16:15:32 -0700
Subject: mm: fix data corruption caused by lazyfree page

MADV_FREE clears pte dirty bit and then marks the page lazyfree (clear
SwapBacked).  There is no lock to prevent the page is added to swap
cache between these two steps by page reclaim.  If page reclaim finds
such page, it will simply add the page to swap cache without pageout the
page to swap because the page is marked as clean.  Next time, page fault
will read data from the swap slot which doesn't have the original data,
so we have a data corruption.  To fix issue, we mark the page dirty and
pageout the page.

However, we shouldn't dirty all pages which is clean and in swap cache.
swapin page is swap cache and clean too.  So we only dirty page which is
added into swap cache in page reclaim, which shouldn't be swapin page.
As Minchan suggested, simply dirty the page in add_to_swap can do the
job.

Fixes: 802a3a92ad7a ("mm: reclaim MADV_FREE pages")
Link: http://lkml.kernel.org/r/08c84256b007bf3f63c91d94383bd9eb6fee2daa.1506446061.git.shli@fb.com
Signed-off-by: Shaohua Li <shli@fb.com>
Reported-by: Artem Savkov <asavkov@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: <stable@vger.kernel.org>	[4.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swap_state.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'mm')

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 71ce2d1ccbf7..ed91091d1e68 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -242,6 +242,17 @@ int add_to_swap(struct page *page)
 		 * clear SWAP_HAS_CACHE flag.
 		 */
 		goto fail;
+	/*
+	 * Normally the page will be dirtied in unmap because its pte should be
+	 * dirty. A special case is MADV_FREE page. The page'e pte could have
+	 * dirty bit cleared but the page's SwapBacked bit is still set because
+	 * clearing the dirty bit and SwapBacked bit has no lock protected. For
+	 * such page, unmap will not set dirty bit for it, so page reclaim will
+	 * not write the page out. This can cause data corruption when the page
+	 * is swap in later. Always setting the dirty bit for the page solves
+	 * the problem.
+	 */
+	set_page_dirty(page);
 
 	return 1;
 
-- 
cgit v1.2.3


From 7d790d2da386a52cfebcf0c898ba927bece9d4ab Mon Sep 17 00:00:00 2001
From: Reza Arbab <arbab@linux.vnet.ibm.com>
Date: Tue, 3 Oct 2017 16:15:35 -0700
Subject: mm/device-public-memory: fix edge case in _vm_normal_page()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With device public pages at the end of my memory space, I'm getting
output from _vm_normal_page():

  BUG: Bad page map in process migrate_pages  pte:c0800001ffff0d06 pmd:f95d3000
  addr:00007fff89330000 vm_flags:00100073 anon_vma:c0000000fa899320 mapping:          (null) index:7fff8933
  file:          (null) fault:          (null) mmap:          (null) readpage:          (null)
  CPU: 0 PID: 13963 Comm: migrate_pages Tainted: P    B      OE 4.14.0-rc1-wip #155
  Call Trace:
     dump_stack+0xb0/0xf4 (unreliable)
     print_bad_pte+0x28c/0x340
     _vm_normal_page+0xc0/0x140
     zap_pte_range+0x664/0xc10
     unmap_page_range+0x318/0x670
     unmap_vmas+0x74/0xe0
     exit_mmap+0xe8/0x1f0
     mmput+0xac/0x1f0
     do_exit+0x348/0xcd0
     do_group_exit+0x5c/0xf0
     SyS_exit_group+0x1c/0x20
     system_call+0x58/0x6c

The pfn causing this is the very last one.  Correct the bounds check
accordingly.

Fixes: df6ad69838fc ("mm/device-public-memory: device memory cache coherent with CPU")
Link: http://lkml.kernel.org/r/1506092178-20351-1-git-send-email-arbab@linux.vnet.ibm.com
Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
Reviewed-by: Jérôme Glisse <jglisse@redhat.com>
Reviewed-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index ec4e15494901..a728bed16c20 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -845,7 +845,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		 * vm_normal_page() so that we do not have to special case all
 		 * call site of vm_normal_page().
 		 */
-		if (likely(pfn < highest_memmap_pfn)) {
+		if (likely(pfn <= highest_memmap_pfn)) {
 			struct page *page = pfn_to_page(pfn);
 
 			if (is_device_public_page(page)) {
-- 
cgit v1.2.3


From f80c7dab95a1f0f968acbafe4426ee9525b6f6ab Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 3 Oct 2017 16:16:10 -0700
Subject: mm: memcontrol: use vmalloc fallback for large kmem memcg arrays

For quick per-memcg indexing, slab caches and list_lru structures
maintain linear arrays of descriptors.  As the number of concurrent
memory cgroups in the system goes up, this requires large contiguous
allocations (8k cgroups = order-5, 16k cgroups = order-6 etc.) for every
existing slab cache and list_lru, which can easily fail on loaded
systems.  E.g.:

  mkdir: page allocation failure: order:5, mode:0x14040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null)
  CPU: 1 PID: 6399 Comm: mkdir Not tainted 4.13.0-mm1-00065-g720bbe532b7c-dirty #481
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014
  Call Trace:
   ? __alloc_pages_direct_compact+0x4c/0x110
   __alloc_pages_nodemask+0xf50/0x1430
   alloc_pages_current+0x60/0xc0
   kmalloc_order_trace+0x29/0x1b0
   __kmalloc+0x1f4/0x320
   memcg_update_all_list_lrus+0xca/0x2e0
   mem_cgroup_css_alloc+0x612/0x670
   cgroup_apply_control_enable+0x19e/0x360
   cgroup_mkdir+0x322/0x490
   kernfs_iop_mkdir+0x55/0x80
   vfs_mkdir+0xd0/0x120
   SyS_mkdirat+0x6c/0xe0
   SyS_mkdir+0x14/0x20
   entry_SYSCALL_64_fastpath+0x18/0xad
  Mem-Info:
  active_anon:2965 inactive_anon:19 isolated_anon:0
   active_file:100270 inactive_file:98846 isolated_file:0
   unevictable:0 dirty:0 writeback:0 unstable:0
   slab_reclaimable:7328 slab_unreclaimable:16402
   mapped:771 shmem:52 pagetables:278 bounce:0
   free:13718 free_pcp:0 free_cma:0

This output is from an artificial reproducer, but we have repeatedly
observed order-7 failures in production in the Facebook fleet.  These
systems become useless as they cannot run more jobs, even though there
is plenty of memory to allocate 128 individual pages.

Use kvmalloc and kvzalloc to fall back to vmalloc space if these arrays
prove too large for allocating them physically contiguous.

Link: http://lkml.kernel.org/r/20170918184919.20644-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/list_lru.c    | 12 ++++++------
 mm/slab_common.c | 22 +++++++++++++++-------
 2 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/mm/list_lru.c b/mm/list_lru.c
index 7a40fa2be858..f141f0c80ff3 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -325,12 +325,12 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
 {
 	int size = memcg_nr_cache_ids;
 
-	nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
+	nlru->memcg_lrus = kvmalloc(size * sizeof(void *), GFP_KERNEL);
 	if (!nlru->memcg_lrus)
 		return -ENOMEM;
 
 	if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
-		kfree(nlru->memcg_lrus);
+		kvfree(nlru->memcg_lrus);
 		return -ENOMEM;
 	}
 
@@ -340,7 +340,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
 static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
 {
 	__memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
-	kfree(nlru->memcg_lrus);
+	kvfree(nlru->memcg_lrus);
 }
 
 static int memcg_update_list_lru_node(struct list_lru_node *nlru,
@@ -351,12 +351,12 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
 	BUG_ON(old_size > new_size);
 
 	old = nlru->memcg_lrus;
-	new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
+	new = kvmalloc(new_size * sizeof(void *), GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
 
 	if (__memcg_init_list_lru_node(new, old_size, new_size)) {
-		kfree(new);
+		kvfree(new);
 		return -ENOMEM;
 	}
 
@@ -373,7 +373,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
 	nlru->memcg_lrus = new;
 	spin_unlock_irq(&nlru->lock);
 
-	kfree(old);
+	kvfree(old);
 	return 0;
 }
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 904a83be82de..80164599ca5d 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -165,9 +165,9 @@ static int init_memcg_params(struct kmem_cache *s,
 	if (!memcg_nr_cache_ids)
 		return 0;
 
-	arr = kzalloc(sizeof(struct memcg_cache_array) +
-		      memcg_nr_cache_ids * sizeof(void *),
-		      GFP_KERNEL);
+	arr = kvzalloc(sizeof(struct memcg_cache_array) +
+		       memcg_nr_cache_ids * sizeof(void *),
+		       GFP_KERNEL);
 	if (!arr)
 		return -ENOMEM;
 
@@ -178,15 +178,23 @@ static int init_memcg_params(struct kmem_cache *s,
 static void destroy_memcg_params(struct kmem_cache *s)
 {
 	if (is_root_cache(s))
-		kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+		kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+}
+
+static void free_memcg_params(struct rcu_head *rcu)
+{
+	struct memcg_cache_array *old;
+
+	old = container_of(rcu, struct memcg_cache_array, rcu);
+	kvfree(old);
 }
 
 static int update_memcg_params(struct kmem_cache *s, int new_array_size)
 {
 	struct memcg_cache_array *old, *new;
 
-	new = kzalloc(sizeof(struct memcg_cache_array) +
-		      new_array_size * sizeof(void *), GFP_KERNEL);
+	new = kvzalloc(sizeof(struct memcg_cache_array) +
+		       new_array_size * sizeof(void *), GFP_KERNEL);
 	if (!new)
 		return -ENOMEM;
 
@@ -198,7 +206,7 @@ static int update_memcg_params(struct kmem_cache *s, int new_array_size)
 
 	rcu_assign_pointer(s->memcg_params.memcg_caches, new);
 	if (old)
-		kfree_rcu(old, rcu);
+		call_rcu(&old->rcu, free_memcg_params);
 	return 0;
 }
 
-- 
cgit v1.2.3


From f64ac5e6e30668216cf489d73ba8a96e372d78c6 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 3 Oct 2017 16:16:16 -0700
Subject: mm, memory_hotplug: add scheduling point to __add_pages

Patch series "mm, memory_hotplug: fix few soft lockups in memory
hotadd".

Johannes has noticed few soft lockups when adding a large nvdimm device.
All of them were caused by a long loop without any explicit cond_resched
which is a problem for !PREEMPT kernels.

The fix is quite straightforward.  Just make sure that cond_resched gets
called from time to time.

This patch (of 3):

__add_pages gets a pfn range to add and there is no upper bound for a
single call.  This is usually a memory block aligned size for the
regular memory hotplug - smaller sizes are usual for memory balloning
drivers, or the whole NUMA node for physical memory online.  There is no
explicit scheduling point in that code path though.

This can lead to long latencies while __add_pages is executed and we
have even seen a soft lockup report during nvdimm initialization with
!PREEMPT kernel

  NMI watchdog: BUG: soft lockup - CPU#11 stuck for 23s! [kworker/u641:3:832]
  [...]
  Workqueue: events_unbound async_run_entry_fn
  task: ffff881809270f40 ti: ffff881809274000 task.ti: ffff881809274000
  RIP: _raw_spin_unlock_irqrestore+0x11/0x20
  RSP: 0018:ffff881809277b10  EFLAGS: 00000286
  [...]
  Call Trace:
    sparse_add_one_section+0x13d/0x18e
    __add_pages+0x10a/0x1d0
    arch_add_memory+0x4a/0xc0
    devm_memremap_pages+0x29d/0x430
    pmem_attach_disk+0x2fd/0x3f0 [nd_pmem]
    nvdimm_bus_probe+0x64/0x110 [libnvdimm]
    driver_probe_device+0x1f7/0x420
    bus_for_each_drv+0x52/0x80
    __device_attach+0xb0/0x130
    bus_probe_device+0x87/0xa0
    device_add+0x3fc/0x5f0
    nd_async_device_register+0xe/0x40 [libnvdimm]
    async_run_entry_fn+0x43/0x150
    process_one_work+0x14e/0x410
    worker_thread+0x116/0x490
    kthread+0xc7/0xe0
    ret_from_fork+0x3f/0x70
  DWARF2 unwinder stuck at ret_from_fork+0x3f/0x70

Fix this by adding cond_resched once per each memory section in the
given pfn range.  Each section is constant amount of work which itself
is not too expensive but many of them will just add up.

Link: http://lkml.kernel.org/r/20170918121410.24466-2-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Johannes Thumshirn <jthumshirn@suse.de>
Tested-by: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Dan Williams <dan.j.williams@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e882cb6da994..23d5bd968950 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -328,6 +328,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
 		if (err && (err != -EEXIST))
 			break;
 		err = 0;
+		cond_resched();
 	}
 	vmemmap_populate_print_last();
 out:
-- 
cgit v1.2.3


From 9b6e63cbf85b89b2dbffa4955dbf2df8250e5375 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Tue, 3 Oct 2017 16:16:19 -0700
Subject: mm, page_alloc: add scheduling point to memmap_init_zone

memmap_init_zone gets a pfn range to initialize and it can be really
large resulting in a soft lockup on non-preemptible kernels

  NMI watchdog: BUG: soft lockup - CPU#31 stuck for 23s! [kworker/u642:5:1720]
  [...]
  task: ffff88ecd7e902c0 ti: ffff88eca4e50000 task.ti: ffff88eca4e50000
  RIP: move_pfn_range_to_zone+0x185/0x1d0
  [...]
  Call Trace:
    devm_memremap_pages+0x2c7/0x430
    pmem_attach_disk+0x2fd/0x3f0 [nd_pmem]
    nvdimm_bus_probe+0x64/0x110 [libnvdimm]
    driver_probe_device+0x1f7/0x420
    bus_for_each_drv+0x52/0x80
    __device_attach+0xb0/0x130
    bus_probe_device+0x87/0xa0
    device_add+0x3fc/0x5f0
    nd_async_device_register+0xe/0x40 [libnvdimm]
    async_run_entry_fn+0x43/0x150
    process_one_work+0x14e/0x410
    worker_thread+0x116/0x490
    kthread+0xc7/0xe0
    ret_from_fork+0x3f/0x70

Fix this by adding a scheduling point once per page block.

Link: http://lkml.kernel.org/r/20170918121410.24466-3-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Johannes Thumshirn <jthumshirn@suse.de>
Tested-by: Johannes Thumshirn <jthumshirn@suse.de>
Cc: Dan Williams <dan.j.williams@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 38d165a87860..77e4d3c5c57b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5367,6 +5367,7 @@ not_early:
 
 			__init_single_page(page, pfn, zone, nid);
 			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+			cond_resched();
 		} else {
 			__init_single_pfn(pfn, zone, nid);
 		}
-- 
cgit v1.2.3


From 1dd2bfc86818ddbc95f98e312e7704350223fd7d Mon Sep 17 00:00:00 2001
From: YASUAKI ISHIMATSU <yasu.isimatu@gmail.com>
Date: Tue, 3 Oct 2017 16:16:29 -0700
Subject: mm/memory_hotplug: change pfn_to_section_nr/section_nr_to_pfn macro
 to inline function

pfn_to_section_nr() and section_nr_to_pfn() are defined as macro.
pfn_to_section_nr() has no issue even if it is defined as macro.  But
section_nr_to_pfn() has overflow issue if sec is defined as int.

section_nr_to_pfn() just shifts sec by PFN_SECTION_SHIFT.  If sec is
defined as unsigned long, section_nr_to_pfn() returns pfn as 64 bit value.
But if sec is defined as int, section_nr_to_pfn() returns pfn as 32 bit
value.

__remove_section() calculates start_pfn using section_nr_to_pfn() and
scn_nr defined as int.  So if hot-removed memory address is over 16TB,
overflow issue occurs and section_nr_to_pfn() does not calculate correct
pfn.

To make callers use proper arg, the patch changes the macros to inline
functions.

Fixes: 815121d2b5cd ("memory_hotplug: clear zone when removing the memory")
Link: http://lkml.kernel.org/r/e643a387-e573-6bbf-d418-c60c8ee3d15e@gmail.com
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Reza Arbab <arbab@linux.vnet.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 23d5bd968950..efd1ad37bb57 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -551,7 +551,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
 		return ret;
 
 	scn_nr = __section_nr(ms);
-	start_pfn = section_nr_to_pfn(scn_nr);
+	start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
 	__remove_zone(zone, start_pfn);
 
 	sparse_remove_one_section(zone, ms, map_offset);
-- 
cgit v1.2.3


From d09b0137d204bebeaafed672bc5a244e9ac92edb Mon Sep 17 00:00:00 2001
From: YASUAKI ISHIMATSU <yasu.isimatu@gmail.com>
Date: Tue, 3 Oct 2017 16:16:32 -0700
Subject: mm/memory_hotplug: define find_{smallest|biggest}_section_pfn as
 unsigned long

find_{smallest|biggest}_section_pfn()s find the smallest/biggest section
and return the pfn of the section.  But the functions are defined as int.
So the functions always return 0x00000000 - 0xffffffff.  It means if
memory address is over 16TB, the functions does not work correctly.

To handle 64 bit value, the patch defines
find_{smallest|biggest}_section_pfn() as unsigned long.

Fixes: 815121d2b5cd ("memory_hotplug: clear zone when removing the memory")
Link: http://lkml.kernel.org/r/d9d5593a-d0a4-c4be-ab08-493df59a85c6@gmail.com
Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Reza Arbab <arbab@linux.vnet.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index efd1ad37bb57..d4b5f29906b9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -338,7 +338,7 @@ EXPORT_SYMBOL_GPL(__add_pages);
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /* find the smallest valid pfn in the range [start_pfn, end_pfn) */
-static int find_smallest_section_pfn(int nid, struct zone *zone,
+static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
 				     unsigned long start_pfn,
 				     unsigned long end_pfn)
 {
@@ -363,7 +363,7 @@ static int find_smallest_section_pfn(int nid, struct zone *zone,
 }
 
 /* find the biggest valid pfn in the range [start_pfn, end_pfn). */
-static int find_biggest_section_pfn(int nid, struct zone *zone,
+static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
 				    unsigned long start_pfn,
 				    unsigned long end_pfn)
 {
-- 
cgit v1.2.3


From 9f1c2674b328a69ab5a9b5a1c52405795ee4163f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 8 Oct 2017 21:44:51 -0700
Subject: net: memcontrol: defer call to mem_cgroup_sk_alloc()

Instead of calling mem_cgroup_sk_alloc() from BH context,
it is better to call it from inet_csk_accept() in process context.

Not only this removes code in mem_cgroup_sk_alloc(), but it also
fixes a bug since listener might have been dismantled and css_get()
might cause a use-after-free.

Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 mm/memcontrol.c | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d5f3a62887cf..661f046ad318 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5828,21 +5828,6 @@ void mem_cgroup_sk_alloc(struct sock *sk)
 	if (!mem_cgroup_sockets_enabled)
 		return;
 
-	/*
-	 * Socket cloning can throw us here with sk_memcg already
-	 * filled. It won't however, necessarily happen from
-	 * process context. So the test for root memcg given
-	 * the current task's memcg won't help us in this case.
-	 *
-	 * Respecting the original socket's memcg is a better
-	 * decision in this case.
-	 */
-	if (sk->sk_memcg) {
-		BUG_ON(mem_cgroup_is_root(sk->sk_memcg));
-		css_get(&sk->sk_memcg->css);
-		return;
-	}
-
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(current);
 	if (memcg == root_mem_cgroup)
-- 
cgit v1.2.3


From e20d103b6c37038ca27409f746f0b3351bcd0c44 Mon Sep 17 00:00:00 2001
From: Mark Hairgrove <mhairgrove@nvidia.com>
Date: Fri, 13 Oct 2017 15:57:30 -0700
Subject: mm/migrate: fix indexing bug (off by one) and avoid out of bound
 access
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Index was incremented before last use and thus the second array could
dereference to an invalid address (not mentioning the fact that it did
not properly clear the entry we intended to clear).

Link: http://lkml.kernel.org/r/1506973525-16491-1-git-send-email-jglisse@redhat.com
Fixes: 8315ada7f095bf ("mm/migrate: allow migrate_vma() to alloc new page on empty entry")
Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Reza Arbab <arbab@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/migrate.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/migrate.c b/mm/migrate.c
index 6954c1435833..e00814ca390e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2146,8 +2146,9 @@ static int migrate_vma_collect_hole(unsigned long start,
 	unsigned long addr;
 
 	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
-		migrate->src[migrate->npages++] = MIGRATE_PFN_MIGRATE;
+		migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
 		migrate->dst[migrate->npages] = 0;
+		migrate->npages++;
 		migrate->cpages++;
 	}
 
-- 
cgit v1.2.3


From c02c30093254189a6ef55fed415a4ffb55a74cdf Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.s@alibaba-inc.com>
Date: Fri, 13 Oct 2017 15:57:37 -0700
Subject: mm/madvise.c: add description for MADV_WIPEONFORK and MADV_KEEPONFORK

mm/madvise.c has a brief description about all MADV_ flags.  Add a
description for the newly added MADV_WIPEONFORK and MADV_KEEPONFORK.

Although man page has the similar information, but it'd better to keep
the consistent with other flags.

Link: http://lkml.kernel.org/r/1506117328-88228-1-git-send-email-yang.s@alibaba-inc.com
Signed-off-by: Yang Shi <yang.s@alibaba-inc.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/madvise.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/madvise.c b/mm/madvise.c
index 25bade36e9ca..fd70d6aabc3e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -757,6 +757,9 @@ madvise_behavior_valid(int behavior)
  *  MADV_DONTFORK - omit this area from child's address space when forking:
  *		typically, to avoid COWing pages pinned by get_user_pages().
  *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
+ *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
+ *              range after a fork.
+ *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
  *  MADV_HWPOISON - trigger memory error handler as if the given memory range
  *		were corrupted by unrecoverable hardware memory failure.
  *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
@@ -777,7 +780,9 @@ madvise_behavior_valid(int behavior)
  *  zero    - success
  *  -EINVAL - start + len < 0, start is not page-aligned,
  *		"behavior" is not a valid value, or application
- *		is attempting to release locked or shared pages.
+ *		is attempting to release locked or shared pages,
+ *		or the specified address range includes file, Huge TLB,
+ *		MAP_SHARED or VMPFNMAP range.
  *  -ENOMEM - addresses in the specified range are not currently
  *		mapped, or are outside the AS of the process.
  *  -EIO    - an I/O error occurred while paging in data.
-- 
cgit v1.2.3


From de55c8b251974247edda38e952da8e8dd71683ec Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Fri, 13 Oct 2017 15:57:43 -0700
Subject: mm/mempolicy: fix NUMA_INTERLEAVE_HIT counter

Commit 3a321d2a3dde ("mm: change the call sites of numa statistics
items") separated NUMA counters from zone counters, but the
NUMA_INTERLEAVE_HIT call site wasn't updated to use the new interface.
So alloc_page_interleave() actually increments NR_ZONE_INACTIVE_FILE
instead of NUMA_INTERLEAVE_HIT.

Fix this by using __inc_numa_state() interface to increment
NUMA_INTERLEAVE_HIT.

Link: http://lkml.kernel.org/r/20171003191003.8573-1-aryabinin@virtuozzo.com
Fixes: 3a321d2a3dde ("mm: change the call sites of numa statistics items")
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Kemi Wang <kemi.wang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 006ba625c0b8..a2af6d58a68f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1920,8 +1920,11 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 	struct page *page;
 
 	page = __alloc_pages(gfp, order, nid);
-	if (page && page_to_nid(page) == nid)
-		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
+	if (page && page_to_nid(page) == nid) {
+		preempt_disable();
+		__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
+		preempt_enable();
+	}
 	return page;
 }
 
-- 
cgit v1.2.3


From af0db981f35ea99b00a0b249bf0bedef8cf972e8 Mon Sep 17 00:00:00 2001
From: Zi Yan <zi.yan@cs.rutgers.edu>
Date: Fri, 13 Oct 2017 15:57:47 -0700
Subject: mm: remove unnecessary WARN_ONCE in page_vma_mapped_walk().

A non present pmd entry can appear after pmd_lock is taken in
page_vma_mapped_walk(), even if THP migration is not enabled.  The
WARN_ONCE is unnecessary.

Link: http://lkml.kernel.org/r/20171003142606.12324-1-zi.yan@sent.com
Fixes: 616b8371539a ("mm: thp: enable thp migration in generic path")
Signed-off-by: Zi Yan <zi.yan@cs.rutgers.edu>
Reported-by: Abdul Haleem <abdhalee@linux.vnet.ibm.com>
Tested-by: Abdul Haleem <abdhalee@linux.vnet.ibm.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_vma_mapped.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 6a03946469a9..eb462e7db0a9 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -167,8 +167,7 @@ restart:
 						return not_found(pvmw);
 					return true;
 				}
-			} else
-				WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
+			}
 			return not_found(pvmw);
 		} else {
 			/* THP pmd was split under us: handle on pte level */
-- 
cgit v1.2.3


From ef4650144e76ae361fe4b8c9a0afcd53074cd520 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@free-electrons.com>
Date: Fri, 13 Oct 2017 15:58:01 -0700
Subject: mm/cma.c: take __GFP_NOWARN into account in cma_alloc()

cma_alloc() unconditionally prints an INFO message when the CMA
allocation fails.  Make this message conditional on the non-presence of
__GFP_NOWARN in gfp_mask.

This patch aims at removing INFO messages that are displayed when the
VC4 driver tries to allocate buffer objects.  From the driver
perspective an allocation failure is acceptable, and the driver can
possibly do something to make following allocation succeed (like
flushing the VC4 internal cache).

Link: http://lkml.kernel.org/r/20171004125447.15195-1-boris.brezillon@free-electrons.com
Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Acked-by: Laura Abbott <labbott@redhat.com>
Cc: Jaewon Kim <jaewon31.kim@samsung.com>
Cc: David Airlie <airlied@linux.ie>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Eric Anholt <eric@anholt.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/cma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/cma.c b/mm/cma.c
index c0da318c020e..022e52bd8370 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -460,7 +460,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 
 	trace_cma_alloc(pfn, page, count, align);
 
-	if (ret) {
+	if (ret && !(gfp_mask & __GFP_NOWARN)) {
 		pr_info("%s: alloc failed, req-size: %zu pages, ret: %d\n",
 			__func__, count, ret);
 		cma_debug_show_areas(cma);
-- 
cgit v1.2.3


From b8c8a338f75e052d9fa2fed851259320af412e3f Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 13 Oct 2017 15:58:05 -0700
Subject: Revert "vmalloc: back off when the current task is killed"

This reverts commits 5d17a73a2ebe ("vmalloc: back off when the current
task is killed") and 171012f56127 ("mm: don't warn when vmalloc() fails
due to a fatal signal").

Commit 5d17a73a2ebe ("vmalloc: back off when the current task is
killed") made all vmalloc allocations from a signal-killed task fail.
We have seen crashes in the tty driver from this, where a killed task
exiting tries to switch back to N_TTY, fails n_tty_open because of the
vmalloc failing, and later crashes when dereferencing tty->disc_data.

Arguably, relying on a vmalloc() call to succeed in order to properly
exit a task is not the most robust way of doing things.  There will be a
follow-up patch to the tty code to fall back to the N_NULL ldisc.

But the justification to make that vmalloc() call fail like this isn't
convincing, either.  The patch mentions an OOM victim exhausting the
memory reserves and thus deadlocking the machine.  But the OOM killer is
only one, improbable source of fatal signals.  It doesn't make sense to
fail allocations preemptively with plenty of memory in most cases.

The patch doesn't mention real-life instances where vmalloc sites would
exhaust memory, which makes it sound more like a theoretical issue to
begin with.  But just in case, the OOM access to memory reserves has
been restricted on the allocator side in cd04ae1e2dc8 ("mm, oom: do not
rely on TIF_MEMDIE for memory reserves access"), which should take care
of any theoretical concerns on that front.

Revert this patch, and the follow-up that suppresses the allocation
warnings when we fail the allocations due to a signal.

Link: http://lkml.kernel.org/r/20171004185906.GB2136@cmpxchg.org
Fixes:  171012f56127 ("mm: don't warn when vmalloc() fails due to a fatal signal")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Alan Cox <alan@llwyncelyn.cymru>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'mm')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8a43db6284eb..673942094328 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1695,11 +1695,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	for (i = 0; i < area->nr_pages; i++) {
 		struct page *page;
 
-		if (fatal_signal_pending(current)) {
-			area->nr_pages = i;
-			goto fail_no_warn;
-		}
-
 		if (node == NUMA_NO_NODE)
 			page = alloc_page(alloc_mask|highmem_mask);
 		else
@@ -1723,7 +1718,6 @@ fail:
 	warn_alloc(gfp_mask, NULL,
 			  "vmalloc: allocation failure, allocated %ld of %ld bytes",
 			  (area->nr_pages*PAGE_SIZE), area->size);
-fail_no_warn:
 	vfree(area->addr);
 	return NULL;
 }
-- 
cgit v1.2.3


From a7b100953aa33a5bbdc3e5e7f2241b9c0704606e Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Fri, 13 Oct 2017 15:58:25 -0700
Subject: mm: page_vma_mapped: ensure pmd is loaded with READ_ONCE outside of
 lock

Loading the pmd without holding the pmd_lock exposes us to races with
concurrent updaters of the page tables but, worse still, it also allows
the compiler to cache the pmd value in a register and reuse it later on,
even if we've performed a READ_ONCE in between and seen a more recent
value.

In the case of page_vma_mapped_walk, this leads to the following crash
when the pmd loaded for the initial pmd_trans_huge check is all zeroes
and a subsequent valid table entry is loaded by check_pmd.  We then
proceed into map_pte, but the compiler re-uses the zero entry inside
pte_offset_map, resulting in a junk pointer being installed in
pvmw->pte:

  PC is at check_pte+0x20/0x170
  LR is at page_vma_mapped_walk+0x2e0/0x540
  [...]
  Process doio (pid: 2463, stack limit = 0xffff00000f2e8000)
  Call trace:
    check_pte+0x20/0x170
    page_vma_mapped_walk+0x2e0/0x540
    page_mkclean_one+0xac/0x278
    rmap_walk_file+0xf0/0x238
    rmap_walk+0x64/0xa0
    page_mkclean+0x90/0xa8
    clear_page_dirty_for_io+0x84/0x2a8
    mpage_submit_page+0x34/0x98
    mpage_process_page_bufs+0x164/0x170
    mpage_prepare_extent_to_map+0x134/0x2b8
    ext4_writepages+0x484/0xe30
    do_writepages+0x44/0xe8
    __filemap_fdatawrite_range+0xbc/0x110
    file_write_and_wait_range+0x48/0xd8
    ext4_sync_file+0x80/0x4b8
    vfs_fsync_range+0x64/0xc0
    SyS_msync+0x194/0x1e8

This patch fixes the problem by ensuring that READ_ONCE is used before
the initial checks on the pmd, and this value is subsequently used when
checking whether or not the pmd is present.  pmd_check is removed and
the pmd_present check is inlined directly.

Link: http://lkml.kernel.org/r/1507222630-5839-1-git-send-email-will.deacon@arm.com
Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use page_vma_mapped_walk()")
Signed-off-by: Will Deacon <will.deacon@arm.com>
Tested-by: Yury Norov <ynorov@caviumnetworks.com>
Tested-by: Richard Ruigrok <rruigrok@codeaurora.org>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_vma_mapped.c | 25 ++++++++++---------------
 1 file changed, 10 insertions(+), 15 deletions(-)

(limited to 'mm')

diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index eb462e7db0a9..53afbb919a1c 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -6,17 +6,6 @@
 
 #include "internal.h"
 
-static inline bool check_pmd(struct page_vma_mapped_walk *pvmw)
-{
-	pmd_t pmde;
-	/*
-	 * Make sure we don't re-load pmd between present and !trans_huge check.
-	 * We need a consistent view.
-	 */
-	pmde = READ_ONCE(*pvmw->pmd);
-	return pmd_present(pmde) && !pmd_trans_huge(pmde);
-}
-
 static inline bool not_found(struct page_vma_mapped_walk *pvmw)
 {
 	page_vma_mapped_walk_done(pvmw);
@@ -116,6 +105,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
+	pmd_t pmde;
 
 	/* The only possible pmd mapping has been handled on last iteration */
 	if (pvmw->pmd && !pvmw->pte)
@@ -148,7 +138,13 @@ restart:
 	if (!pud_present(*pud))
 		return false;
 	pvmw->pmd = pmd_offset(pud, pvmw->address);
-	if (pmd_trans_huge(*pvmw->pmd) || is_pmd_migration_entry(*pvmw->pmd)) {
+	/*
+	 * Make sure the pmd value isn't cached in a register by the
+	 * compiler and used as a stale value after we've observed a
+	 * subsequent update.
+	 */
+	pmde = READ_ONCE(*pvmw->pmd);
+	if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
 		pvmw->ptl = pmd_lock(mm, pvmw->pmd);
 		if (likely(pmd_trans_huge(*pvmw->pmd))) {
 			if (pvmw->flags & PVMW_MIGRATION)
@@ -174,9 +170,8 @@ restart:
 			spin_unlock(pvmw->ptl);
 			pvmw->ptl = NULL;
 		}
-	} else {
-		if (!check_pmd(pvmw))
-			return false;
+	} else if (!pmd_present(pmde)) {
+		return false;
 	}
 	if (!map_pte(pvmw))
 		goto next_pte;
-- 
cgit v1.2.3


From 61b639723be5a9fc4812d5d85cb769589afa5a38 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Fri, 13 Oct 2017 15:58:29 -0700
Subject: mm, swap: use page-cluster as max window of VMA based swap readahead

When the VMA based swap readahead was introduced, a new knob

  /sys/kernel/mm/swap/vma_ra_max_order

was added as the max window of VMA swap readahead.  This is to make it
possible to use different max window for VMA based readahead and
original physical readahead.  But Minchan Kim pointed out that this will
cause a regression because setting page-cluster sysctl to zero cannot
disable swap readahead with the change.

To fix the regression, the page-cluster sysctl is used as the max window
of both the VMA based swap readahead and original physical swap
readahead.  If more fine grained control is needed in the future, more
knobs can be added as the subordinate knobs of the page-cluster sysctl.

The vma_ra_max_order knob is deleted.  Because the knob was introduced
in v4.14-rc1, and this patch is targeting being merged before v4.14
releasing, there should be no existing users of this newly added ABI.

Link: http://lkml.kernel.org/r/20171011070847.16003-1-ying.huang@intel.com
Fixes: ec560175c0b6fce ("mm, swap: VMA based swap readahead")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Reported-by: Minchan Kim <minchan@kernel.org>
Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swap_state.c | 41 +++++++----------------------------------
 1 file changed, 7 insertions(+), 34 deletions(-)

(limited to 'mm')

diff --git a/mm/swap_state.c b/mm/swap_state.c
index ed91091d1e68..05b6803f0cce 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -39,10 +39,6 @@ struct address_space *swapper_spaces[MAX_SWAPFILES];
 static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
 bool swap_vma_readahead = true;
 
-#define SWAP_RA_MAX_ORDER_DEFAULT	3
-
-static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT;
-
 #define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
 #define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
 #define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
@@ -664,6 +660,13 @@ struct page *swap_readahead_detect(struct vm_fault *vmf,
 	pte_t *tpte;
 #endif
 
+	max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
+			     SWAP_RA_ORDER_CEILING);
+	if (max_win == 1) {
+		swap_ra->win = 1;
+		return NULL;
+	}
+
 	faddr = vmf->address;
 	entry = pte_to_swp_entry(vmf->orig_pte);
 	if ((unlikely(non_swap_entry(entry))))
@@ -672,12 +675,6 @@ struct page *swap_readahead_detect(struct vm_fault *vmf,
 	if (page)
 		return page;
 
-	max_win = 1 << READ_ONCE(swap_ra_max_order);
-	if (max_win == 1) {
-		swap_ra->win = 1;
-		return NULL;
-	}
-
 	fpfn = PFN_DOWN(faddr);
 	swap_ra_info = GET_SWAP_RA_VAL(vma);
 	pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info));
@@ -786,32 +783,8 @@ static struct kobj_attribute vma_ra_enabled_attr =
 	__ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
 	       vma_ra_enabled_store);
 
-static ssize_t vma_ra_max_order_show(struct kobject *kobj,
-				     struct kobj_attribute *attr, char *buf)
-{
-	return sprintf(buf, "%d\n", swap_ra_max_order);
-}
-static ssize_t vma_ra_max_order_store(struct kobject *kobj,
-				      struct kobj_attribute *attr,
-				      const char *buf, size_t count)
-{
-	int err, v;
-
-	err = kstrtoint(buf, 10, &v);
-	if (err || v > SWAP_RA_ORDER_CEILING || v <= 0)
-		return -EINVAL;
-
-	swap_ra_max_order = v;
-
-	return count;
-}
-static struct kobj_attribute vma_ra_max_order_attr =
-	__ATTR(vma_ra_max_order, 0644, vma_ra_max_order_show,
-	       vma_ra_max_order_store);
-
 static struct attribute *swap_attrs[] = {
 	&vma_ra_enabled_attr.attr,
-	&vma_ra_max_order_attr.attr,
 	NULL,
 };
 
-- 
cgit v1.2.3


From 0ea7eeec24be5f04ae80d68f5b1ea3a11f49de2f Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 17 Oct 2017 16:55:52 +0200
Subject: mm, percpu: add support for __GFP_NOWARN flag

Add an option for pcpu_alloc() to support __GFP_NOWARN flag.
Currently, we always throw a warning when size or alignment
is unsupported (and also dump stack on failed allocation
requests). The warning itself is harmless since we return
NULL anyway for any failed request, which callers are
required to handle anyway. However, it becomes harmful when
panic_on_warn is set.

The rationale for the WARN() in pcpu_alloc() is that it can
be tracked when larger than supported allocation requests are
made such that allocations limits can be tweaked if warranted.
This makes sense for in-kernel users, however, there are users
of pcpu allocator where allocation size is derived from user
space requests, e.g. when creating BPF maps. In these cases,
the requests should fail gracefully without throwing a splat.

The current work-around was to check allocation size against
the upper limit of PCPU_MIN_UNIT_SIZE from call-sites for
bailing out prior to a call to pcpu_alloc() in order to
avoid throwing the WARN(). This is bad in multiple ways since
PCPU_MIN_UNIT_SIZE is an implementation detail, and having
the checks on call-sites only complicates the code for no
good reason. Thus, lets fix it generically by supporting the
__GFP_NOWARN flag that users can then use with calling the
__alloc_percpu_gfp() helper instead.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Tejun Heo <tj@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 mm/percpu.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/percpu.c b/mm/percpu.c
index aa121cef76de..a0e0c82c1e4c 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1329,7 +1329,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  * @gfp: allocation flags
  *
  * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
- * contain %GFP_KERNEL, the allocation is atomic.
+ * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
+ * then no warning will be triggered on invalid or failed allocation
+ * requests.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
@@ -1337,10 +1339,11 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 				 gfp_t gfp)
 {
+	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
+	bool do_warn = !(gfp & __GFP_NOWARN);
 	static int warn_limit = 10;
 	struct pcpu_chunk *chunk;
 	const char *err;
-	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
 	int slot, off, cpu, ret;
 	unsigned long flags;
 	void __percpu *ptr;
@@ -1361,7 +1364,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 
 	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
 		     !is_power_of_2(align))) {
-		WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n",
+		WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
 		     size, align);
 		return NULL;
 	}
@@ -1482,7 +1485,7 @@ fail_unlock:
 fail:
 	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
 
-	if (!is_atomic && warn_limit) {
+	if (!is_atomic && do_warn && warn_limit) {
 		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
 			size, align, is_atomic, err);
 		dump_stack();
@@ -1507,7 +1510,9 @@ fail:
  *
  * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
  * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
- * be called from any context but is a lot more likely to fail.
+ * be called from any context but is a lot more likely to fail. If @gfp
+ * has __GFP_NOWARN then no warning will be triggered on invalid or failed
+ * allocation requests.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
-- 
cgit v1.2.3