From 7f36e3e56db1ae75d1e157011b3cb2e0957f0a7e Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Fri, 4 Sep 2015 15:42:32 -0700 Subject: memory-hotplug: add hot-added memory ranges to memblock before allocating node_data for a node. Commit f9126ab9241f ("memory-hotplug: fix wrong edge when hot add a new node") added the hot-added memory range to memblock after creating the pgdat for the new node. But there is a problem: add_memory() |--> hotadd_new_pgdat() |--> free_area_init_node() |--> get_pfn_range_for_nid() |--> find start_pfn and end_pfn in memblock |--> ...... |--> memblock_add_node(start, size, nid) -------- Here, it is just too late. get_pfn_range_for_nid() will find that start_pfn and end_pfn are both 0. As a result, when adding memory, dmesg will give the following wrong message. Initmem setup node 5 [mem 0x0000000000000000-0xffffffffffffffff] On node 5 totalpages: 0 Built 5 zonelists in Node order, mobility grouping on. Total pages: 32588823 Policy zone: Normal init_memory_mapping: [mem 0x60000000000-0x607ffffffff] The solution is simple: just add the memory range to memblock a little earlier, before hotadd_new_pgdat(). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Tang Chen Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Cc: Kamezawa Hiroyuki Cc: Taku Izumi Cc: Gu Zheng Cc: Naoya Horiguchi Cc: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Cc: [4.2.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6da82bcb0a8b..8fd97dac538a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1248,6 +1248,14 @@ int __ref add_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); + /* + * Add new range to memblock so that when hotadd_new_pgdat() is called + * to allocate new pgdat, get_pfn_range_for_nid() will be able to find + * this new range and calculate total pages correctly. The range will + * be removed at hot-remove time. + */ + memblock_add_node(start, size, nid); + new_node = !node_online(nid); if (new_node) { pgdat = hotadd_new_pgdat(nid, start); @@ -1277,7 +1285,6 @@ int __ref add_memory(int nid, u64 start, u64 size) /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); - memblock_add_node(start, size, nid); goto out; @@ -1286,6 +1293,7 @@ error: if (new_pgdat) rollback_node_hotadd(nid, pgdat); release_memory_resource(res); + memblock_remove(start, size); out: mem_hotplug_done(); -- cgit v1.2.3 From aa1057b3dec478b20c77bad07442318ae36d893c Mon Sep 17 00:00:00 2001 From: Ryan Ding Date: Fri, 4 Sep 2015 15:42:36 -0700 Subject: ocfs2: direct write will call ocfs2_rw_unlock() twice when doing aio+dio ocfs2_file_write_iter() is checking the wrong return value ('ret' instead of 'written'). This will cause ocfs2_rw_unlock() to be called in both write_iter and end_io, triggering a BUG_ON. This issue was introduced by commit 7da839c47589 ("ocfs2: use __generic_file_write_iter()").
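To make the unlock ownership rule concrete, here is a minimal sketch (a fragment, not the actual ocfs2 source; it assumes only the names already used in this patch) of the decision the fix moves in front of the error check:

	written = __generic_file_write_iter(iocb, from);
	/* 'written' is what the direct-I/O path actually returned; testing a
	 * stale 'ret' here instead lets both this path and the dio end_io
	 * callback unlock the rw lock, tripping the BUG_ON described above. */
	if (written == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
		rw_level = -1;		/* async dio or end_io now owns the unlock */
		unaligned_dio = 0;
	}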
Orabug: 21612107 Fixes: 7da839c47589 ("ocfs2: use __generic_file_write_iter()") Signed-off-by: Ryan Ding Reviewed-by: Junxiao Bi Cc: Al Viro Cc: Mark Fasheh Cc: Joel Becker Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 7210583b472f..2eb11363b1f7 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2378,6 +2378,20 @@ relock: /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); + /* + * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io + * function pointer which is called when o_direct io completes so that + * it can unlock our rw lock. + * Unfortunately there are error cases which call end_io and others + * that don't. so we don't have to unlock the rw_lock if either an + * async dio is going to do it in the future or an end_io after an + * error has already done it. + */ + if ((written == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { + rw_level = -1; + unaligned_dio = 0; + } + if (unlikely(written <= 0)) goto no_sync; @@ -2402,20 +2416,6 @@ relock: } no_sync: - /* - * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io - * function pointer which is called when o_direct io completes so that - * it can unlock our rw lock. - * Unfortunately there are error cases which call end_io and others - * that don't. so we don't have to unlock the rw_lock if either an - * async dio is going to do it in the future or an end_io after an - * error has already done it. - */ - if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { - rw_level = -1; - unaligned_dio = 0; - } - if (unaligned_dio) { ocfs2_iocb_clear_unaligned_aio(iocb); mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); -- cgit v1.2.3 From 04697858d89e4bf2650364f8d6956e2554e8ef88 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 4 Sep 2015 15:42:39 -0700 Subject: mm: check if section present during memory block registering Tony Luck found that on his setup a memory block size of 512M would cause a crash during boot. BUG: unable to handle kernel paging request at ffffea0074000020 IP: get_nid_for_pfn+0x17/0x40 PGD 128ffcb067 PUD 128ffc9067 PMD 0 Oops: 0000 [#1] SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.2.0-rc8 #1 ... Call Trace: ? register_mem_sect_under_node+0x66/0xe0 register_one_node+0x17b/0x240 ? pci_iommu_alloc+0x6e/0x6e topology_init+0x3c/0x95 do_one_initcall+0xcd/0x1f0 The system has non-contiguous RAM addresses: BIOS-e820: [mem 0x0000001300000000-0x0000001cffffffff] usable BIOS-e820: [mem 0x0000001d70000000-0x0000001ec7ffefff] usable BIOS-e820: [mem 0x0000001f00000000-0x0000002bffffffff] usable BIOS-e820: [mem 0x0000002c18000000-0x0000002d6fffefff] usable BIOS-e820: [mem 0x0000002e00000000-0x00000039ffffffff] usable So a memory block can begin with sections that are not present. For example, in the 512M memory block [0x2c18000000, 0x2c20000000) the first three sections are not present. The current register_mem_sect_under_node() assumes the first section is present, but the memory block's section number range [start_section_nr, end_section_nr] can include sections that are not present. For arches that support vmemmap, we do not set up the memmap (the struct page area) for sections that are not present. So skip the pfn ranges that belong to absent sections.
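To see the arithmetic of the skip added below, consider a hypothetical worked example (assuming 4K pages and 128M sections, so PAGES_PER_SECTION = 32768 = 0x8000, and using the kernel's round_down() helper):

	unsigned long pfn = 0x2c18000;	/* 0x2c18000000 >> PAGE_SHIFT, in an absent section */
	pfn = round_down(pfn + PAGES_PER_SECTION, PAGES_PER_SECTION) - 1;
	/* pfn is now 0x2c1ffff, the last pfn of the absent section; the
	 * enclosing for-loop's pfn++ then lands on 0x2c20000, the first
	 * pfn of the next section, which is tested with pfn_present()
	 * again. */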
[akpm@linux-foundation.org: simplification] [rientjes@google.com: more simplification] Fixes: bdee237c0343 ("x86: mm: Use 2GB memory block size on large memory x86-64 systems") Fixes: 982792c782ef ("x86, mm: probe memory block size for generic x86 64bit") Signed-off-by: Yinghai Lu Signed-off-by: David Rientjes Reported-by: Tony Luck Tested-by: Tony Luck Cc: Greg KH Cc: Ingo Molnar Tested-by: David Rientjes Cc: [3.15+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/base/node.c b/drivers/base/node.c index 31df474d72f4..560751bad294 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -392,6 +392,16 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid) for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { int page_nid; + /* + * memory block could have several absent sections from start. + * skip pfn range from absent section + */ + if (!pfn_present(pfn)) { + pfn = round_down(pfn + PAGES_PER_SECTION, + PAGES_PER_SECTION) - 1; + continue; + } + page_nid = get_nid_for_pfn(pfn); if (page_nid < 0) continue; -- cgit v1.2.3 From e9f069868d60550c4b46f084ac9276a57c1b4711 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Sep 2015 15:42:42 -0700 Subject: kernel/kthread.c:kthread_create_on_node(): clarify documentation - Make it clear that the `node' arg refers to memory allocations only: kthread_create_on_node() does not pin the new thread to that node's CPUs. - Encourage the use of NUMA_NO_NODE. [nzimmer@sgi.com: use NUMA_NO_NODE in kthread_create() also] Cc: Nathan Zimmer Cc: Tejun Heo Cc: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 2 +- kernel/kthread.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 869b21dcf503..e691b6a23f72 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -11,7 +11,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), const char namefmt[], ...); #define kthread_create(threadfn, data, namefmt, arg...) \ - kthread_create_on_node(threadfn, data, -1, namefmt, ##arg) + kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg) struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), diff --git a/kernel/kthread.c b/kernel/kthread.c index 490924cc9e7c..9ff173dca1ae 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -248,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create) * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. - * @node: memory node number. + * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(). + * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and + * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node - * in @node, to get NUMA affinity for kthread stack, or else give -1. + * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. 
@threadfn() can either call do_exit() directly if it is a * standalone thread for which no one will call kthread_stop(), or -- cgit v1.2.3 From 58319057b7847667f0c9585b9de0e8932b0fdb08 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 4 Sep 2015 15:42:45 -0700 Subject: capabilities: ambient capabilities Credit where credit is due: this idea comes from Christoph Lameter with a lot of valuable input from Serge Hallyn. This patch is heavily based on Christoph's patch. ===== The status quo ===== On Linux, there are a number of capabilities defined by the kernel. To perform various privileged tasks, processes can wield capabilities that they hold. Each task has four capability masks: effective (pE), permitted (pP), inheritable (pI), and a bounding set (X). When the kernel checks for a capability, it checks pE. The other capability masks serve to modify what capabilities can be in pE. Any task can remove capabilities from pE, pP, or pI at any time. If a task has a capability in pP, it can add that capability to pE and/or pI. If a task has CAP_SETPCAP, then it can add any capability to pI, and it can remove capabilities from X. Tasks are not the only things that can have capabilities; files can also have capabilities. A file can have no capability information at all [1]. If a file has capability information, then it has a permitted mask (fP) and an inheritable mask (fI) as well as a single effective bit (fE) [2]. File capabilities modify the capabilities of tasks that execve(2) them. A task that successfully calls execve has its capabilities modified for the file ultimately being executed (i.e. the binary itself if that binary is ELF, or the interpreter if the binary is a script.) [3] In the capability evolution rules, for each mask Z, pZ represents the old value and pZ' represents the new value. The rules are: pP' = (X & fP) | (pI & fI) pI' = pI pE' = (fE ? pP' : 0) X is unchanged For setuid binaries, fP, fI, and fE are modified by a moderately complicated set of rules that emulate POSIX behavior. Similarly, if euid == 0 or ruid == 0, then fP, fI, and fE are modified differently (primarily, fP and fI usually end up being the full set). For nonroot users executing binaries with neither setuid nor file caps, fI and fP are empty and fE is false. As an extra complication, if you execute a process as nonroot and fE is set, then the "secure exec" rules are in effect: AT_SECURE gets set, LD_PRELOAD doesn't work, etc. This is rather messy. We've learned that making any changes is dangerous, though: if a new kernel version allows an unprivileged program to change its security state in a way that persists across execution of a setuid program or a program with file caps, this persistent state is surprisingly likely to allow setuid or file-capped programs to be exploited for privilege escalation. ===== The problem ===== Capability inheritance is basically useless. If you aren't root and you execute an ordinary binary, fI is zero, so your capabilities have no effect whatsoever on pP'. This means that you can't usefully execute a helper process or a shell command with elevated capabilities if you aren't root. On current kernels, you can sort of work around this by setting fI to the full set for most or all non-setuid executable files. This causes pP' = pI for nonroot, and inheritance works. No one does this because it's a PITA and it isn't even supported on most filesystems. If you try this, you'll discover that every nonroot program ends up with secure exec rules, breaking many things.
This is a problem that has bitten many people who have tried to use capabilities for anything useful. ===== The proposed change ===== This patch adds a fifth capability mask called the ambient mask (pA). pA does what most people expect pI to do. pA obeys the invariant that no bit can ever be set in pA if it is not set in both pP and pI. Dropping a bit from pP or pI drops that bit from pA. This ensures that existing programs that try to drop capabilities still do so, with a complication. Because capability inheritance is so broken, setting KEEPCAPS, using setresuid to switch to nonroot uids, and then calling execve effectively drops capabilities. Therefore, setresuid from root to nonroot conditionally clears pA unless SECBIT_NO_SETUID_FIXUP is set. Processes that don't like this can re-add bits to pA afterwards. The capability evolution rules are changed: pA' = (file caps or setuid or setgid ? 0 : pA) pP' = (X & fP) | (pI & fI) | pA' pI' = pI pE' = (fE ? pP' : pA') X is unchanged If you are nonroot but you have a capability, you can add it to pA. If you do so, your children get that capability in pA, pP, and pE. For example, you can set pA = CAP_NET_BIND_SERVICE, and your children can automatically bind low-numbered ports. Hallelujah! Unprivileged users can create user namespaces, map themselves to a nonzero uid, and create both privileged (relative to their namespace) and unprivileged process trees. This is currently more or less impossible. Hallelujah! You cannot use pA to try to subvert a setuid, setgid, or file-capped program: if you execute any such program, pA gets cleared and the resulting evolution rules are unchanged by this patch. Users with nonzero pA are unlikely to unintentionally leak that capability. If they run programs that try to drop privileges, dropping privileges will still work. It's worth noting that the degree of paranoia in this patch could possibly be reduced without causing serious problems. Specifically, if we allowed pA to persist across executing non-pA-aware setuid binaries and across setresuid, then, naively, the only capabilities that could leak as a result would be the capabilities in pA, and any attacker *already* has those capabilities. This would make me nervous, though -- setuid binaries that tried to privilege-separate might fail to do so, and putting CAP_DAC_READ_SEARCH or CAP_DAC_OVERRIDE into pA could have unexpected side effects. (Whether these unexpected side effects would be exploitable is an open question.) I've therefore taken the more paranoid route. We can revisit this later. An alternative would be to require PR_SET_NO_NEW_PRIVS before setting ambient capabilities. I think that this would be annoying and would make granting otherwise unprivileged users minor ambient capabilities (CAP_NET_BIND_SERVICE or CAP_NET_RAW for example) much less useful than it is with this patch. ===== Footnotes ===== [1] Files that are missing the "security.capability" xattr or that have unrecognized values for that xattr end up with has_cap set to false. The code that does that appears to be complicated for no good reason. [2] The libcap capability mask parsers and formatters are dangerously misleading and the documentation is flat-out wrong. fE is *not* a mask; it's a single bit. This has probably confused every single person who has tried to use file capabilities. [3] Linux very confusingly processes both the script and the interpreter if applicable, for reasons that elude me. 
The results from thinking about a script's file capabilities and/or setuid bits are mostly discarded. Preliminary userspace code is here, but it needs updating: https://git.kernel.org/cgit/linux/kernel/git/luto/util-linux-playground.git/commit/?h=cap_ambient&id=7f5afbd175d2 Here is a test program that can be used to verify the functionality (from Christoph): /* * Test program for the ambient capabilities. This program spawns a shell * that allows running processes with a defined set of capabilities. * * (C) 2015 Christoph Lameter * Released under: GPL v3 or later. * * * Compile using: * * gcc -o ambient_test ambient_test.o -lcap-ng * * This program must have the following capabilities to run properly: * Permissions for CAP_NET_RAW, CAP_NET_ADMIN, CAP_SYS_NICE * * A command to equip the binary with the right caps is: * * setcap cap_net_raw,cap_net_admin,cap_sys_nice+p ambient_test * * * To get a shell with additional caps that can be inherited by other processes: * * ./ambient_test /bin/bash * * * Verifying that it works: * * From the bash spawned by ambient_test run * * cat /proc/$$/status * * and have a look at the capabilities. */ #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <sys/prctl.h> #include <linux/capability.h> #include <cap-ng.h> /* * Definitions from the kernel header files. These are going to be removed * when the /usr/include files have these defined. */ #define PR_CAP_AMBIENT 47 #define PR_CAP_AMBIENT_IS_SET 1 #define PR_CAP_AMBIENT_RAISE 2 #define PR_CAP_AMBIENT_LOWER 3 #define PR_CAP_AMBIENT_CLEAR_ALL 4 static void set_ambient_cap(int cap) { int rc; capng_get_caps_process(); rc = capng_update(CAPNG_ADD, CAPNG_INHERITABLE, cap); if (rc) { printf("Cannot add inheritable cap\n"); exit(2); } capng_apply(CAPNG_SELECT_CAPS); /* Note the two 0s at the end. Kernel checks for these */ if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, cap, 0, 0)) { perror("Cannot set cap"); exit(1); } } int main(int argc, char **argv) { int rc; set_ambient_cap(CAP_NET_RAW); set_ambient_cap(CAP_NET_ADMIN); set_ambient_cap(CAP_SYS_NICE); printf("Ambient_test forking shell\n"); if (execv(argv[1], argv + 1)) perror("Cannot exec"); return 0; } Signed-off-by: Christoph Lameter # Original author Signed-off-by: Andy Lutomirski Acked-by: Serge E. Hallyn Acked-by: Kees Cook Cc: Jonathan Corbet Cc: Aaron Jones Cc: Ted Ts'o Cc: Andrew G.
Morgan Cc: Mimi Zohar Cc: Austin S Hemmelgarn Cc: Markku Savela Cc: Jarkko Sakkinen Cc: Michael Kerrisk Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 5 ++- include/linux/cred.h | 8 ++++ include/uapi/linux/prctl.h | 7 +++ kernel/user_namespace.c | 1 + security/commoncap.c | 102 ++++++++++++++++++++++++++++++++++++++----- security/keys/process_keys.c | 1 + 6 files changed, 113 insertions(+), 11 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index ce065cf3104f..f60f0121e331 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header, static inline void task_cap(struct seq_file *m, struct task_struct *p) { const struct cred *cred; - kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; + kernel_cap_t cap_inheritable, cap_permitted, cap_effective, + cap_bset, cap_ambient; rcu_read_lock(); cred = __task_cred(p); @@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) cap_permitted = cred->cap_permitted; cap_effective = cred->cap_effective; cap_bset = cred->cap_bset; + cap_ambient = cred->cap_ambient; rcu_read_unlock(); render_cap_t(m, "CapInh:\t", &cap_inheritable); render_cap_t(m, "CapPrm:\t", &cap_permitted); render_cap_t(m, "CapEff:\t", &cap_effective); render_cap_t(m, "CapBnd:\t", &cap_bset); + render_cap_t(m, "CapAmb:\t", &cap_ambient); } static inline void task_seccomp(struct seq_file *m, struct task_struct *p) diff --git a/include/linux/cred.h b/include/linux/cred.h index 8b6c083e68a7..8d70e1361ecd 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -137,6 +137,7 @@ struct cred { kernel_cap_t cap_permitted; /* caps we're permitted */ kernel_cap_t cap_effective; /* caps we can actually use */ kernel_cap_t cap_bset; /* capability bounding set */ + kernel_cap_t cap_ambient; /* Ambient capability set */ #ifdef CONFIG_KEYS unsigned char jit_keyring; /* default keyring to attach requested * keys to */ @@ -212,6 +213,13 @@ static inline void validate_process_creds(void) } #endif +static inline bool cap_ambient_invariant_ok(const struct cred *cred) +{ + return cap_issubset(cred->cap_ambient, + cap_intersect(cred->cap_permitted, + cred->cap_inheritable)); +} + /** * get_new_cred - Get a reference on a new set of credentials * @cred: The new credentials to reference diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 31891d9535e2..a8d0759a9e40 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -190,4 +190,11 @@ struct prctl_mm_map { # define PR_FP_MODE_FR (1 << 0) /* 64b FP registers */ # define PR_FP_MODE_FRE (1 << 1) /* 32b compatibility */ +/* Control the ambient capability set */ +#define PR_CAP_AMBIENT 47 +# define PR_CAP_AMBIENT_IS_SET 1 +# define PR_CAP_AMBIENT_RAISE 2 +# define PR_CAP_AMBIENT_LOWER 3 +# define PR_CAP_AMBIENT_CLEAR_ALL 4 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f65a0a06a8c0..88fefa68c516 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; + cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); diff --git a/security/commoncap.c b/security/commoncap.c index d103f5a4043d..1f74dde1063e 
100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -267,6 +267,16 @@ int cap_capset(struct cred *new, new->cap_effective = *effective; new->cap_inheritable = *inheritable; new->cap_permitted = *permitted; + + /* + * Mask off ambient bits that are no longer both permitted and + * inheritable. + */ + new->cap_ambient = cap_intersect(new->cap_ambient, + cap_intersect(*permitted, + *inheritable)); + if (WARN_ON(!cap_ambient_invariant_ok(new))) + return -EINVAL; return 0; } @@ -347,6 +357,7 @@ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, /* * pP' = (X & fP) | (pI & fI) + * The addition of pA' is handled later. */ new->cap_permitted.cap[i] = (new->cap_bset.cap[i] & permitted) | @@ -474,10 +485,13 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) { const struct cred *old = current_cred(); struct cred *new = bprm->cred; - bool effective, has_cap = false; + bool effective, has_cap = false, is_setid; int ret; kuid_t root_uid; + if (WARN_ON(!cap_ambient_invariant_ok(old))) + return -EPERM; + effective = false; ret = get_file_caps(bprm, &effective, &has_cap); if (ret < 0) @@ -522,8 +536,9 @@ skip: * * In addition, if NO_NEW_PRIVS, then ensure we get no new privs. */ - if ((!uid_eq(new->euid, old->uid) || - !gid_eq(new->egid, old->gid) || + is_setid = !uid_eq(new->euid, old->uid) || !gid_eq(new->egid, old->gid); + + if ((is_setid || !cap_issubset(new->cap_permitted, old->cap_permitted)) && bprm->unsafe & ~LSM_UNSAFE_PTRACE_CAP) { /* downgrade; they get no more than they had, and maybe less */ @@ -539,10 +554,28 @@ skip: new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; + /* File caps or setid cancels ambient. */ + if (has_cap || is_setid) + cap_clear(new->cap_ambient); + + /* + * Now that we've computed pA', update pP' to give: + * pP' = (X & fP) | (pI & fI) | pA' + */ + new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient); + + /* + * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set, + * this is the same as pE' = (fE ? pP' : 0) | pA'. + */ if (effective) new->cap_effective = new->cap_permitted; else - cap_clear(new->cap_effective); + new->cap_effective = new->cap_ambient; + + if (WARN_ON(!cap_ambient_invariant_ok(new))) + return -EPERM; + bprm->cap_effective = effective; /* @@ -557,7 +590,7 @@ skip: * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. 
*/ - if (!cap_isclear(new->cap_effective)) { + if (!cap_issubset(new->cap_effective, new->cap_ambient)) { if (!cap_issubset(CAP_FULL_SET, new->cap_effective) || !uid_eq(new->euid, root_uid) || !uid_eq(new->uid, root_uid) || issecure(SECURE_NOROOT)) { @@ -568,6 +601,10 @@ skip: } new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); + + if (WARN_ON(!cap_ambient_invariant_ok(new))) + return -EPERM; + return 0; } @@ -589,7 +626,7 @@ int cap_bprm_secureexec(struct linux_binprm *bprm) if (!uid_eq(cred->uid, root_uid)) { if (bprm->cap_effective) return 1; - if (!cap_isclear(cred->cap_permitted)) + if (!cap_issubset(cred->cap_permitted, cred->cap_ambient)) return 1; } @@ -691,10 +728,18 @@ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old) uid_eq(old->suid, root_uid)) && (!uid_eq(new->uid, root_uid) && !uid_eq(new->euid, root_uid) && - !uid_eq(new->suid, root_uid)) && - !issecure(SECURE_KEEP_CAPS)) { - cap_clear(new->cap_permitted); - cap_clear(new->cap_effective); + !uid_eq(new->suid, root_uid))) { + if (!issecure(SECURE_KEEP_CAPS)) { + cap_clear(new->cap_permitted); + cap_clear(new->cap_effective); + } + + /* + * Pre-ambient programs expect setresuid to nonroot followed + * by exec to drop capabilities. We should make sure that + * this remains the case. + */ + cap_clear(new->cap_ambient); } if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid)) cap_clear(new->cap_effective); @@ -924,6 +969,43 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); return commit_creds(new); + case PR_CAP_AMBIENT: + if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) { + if (arg3 | arg4 | arg5) + return -EINVAL; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + cap_clear(new->cap_ambient); + return commit_creds(new); + } + + if (((!cap_valid(arg3)) | arg4 | arg5)) + return -EINVAL; + + if (arg2 == PR_CAP_AMBIENT_IS_SET) { + return !!cap_raised(current_cred()->cap_ambient, arg3); + } else if (arg2 != PR_CAP_AMBIENT_RAISE && + arg2 != PR_CAP_AMBIENT_LOWER) { + return -EINVAL; + } else { + if (arg2 == PR_CAP_AMBIENT_RAISE && + (!cap_raised(current_cred()->cap_permitted, arg3) || + !cap_raised(current_cred()->cap_inheritable, + arg3))) + return -EPERM; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + if (arg2 == PR_CAP_AMBIENT_RAISE) + cap_raise(new->cap_ambient, arg3); + else + cap_lower(new->cap_ambient, arg3); + return commit_creds(new); + } + default: /* No functionality available - continue with default */ return -ENOSYS; diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index bd536cb221e2..43b4cddbf2b3 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -848,6 +848,7 @@ void key_change_session_keyring(struct callback_head *twork) new->cap_inheritable = old->cap_inheritable; new->cap_permitted = old->cap_permitted; new->cap_effective = old->cap_effective; + new->cap_ambient = old->cap_ambient; new->cap_bset = old->cap_bset; new->jit_keyring = old->jit_keyring; -- cgit v1.2.3 From 32ae976ed3b5ba39c9208ace41bcdf4157d21db3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 4 Sep 2015 15:42:48 -0700 Subject: selftests/capabilities: Add tests for capability evolution This test focuses on ambient capabilities. It requires either root or the ability to create user namespaces. Some of the test cases will be skipped for nonroot users. Signed-off-by: Andy Lutomirski Acked-by: Kees Cook Cc: Christoph Lameter # Original author Cc: Serge E. 
Hallyn Cc: James Morris Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/capabilities/.gitignore | 2 + tools/testing/selftests/capabilities/Makefile | 18 + tools/testing/selftests/capabilities/test_execve.c | 427 +++++++++++++++++++++ .../testing/selftests/capabilities/validate_cap.c | 73 ++++ 4 files changed, 520 insertions(+) create mode 100644 tools/testing/selftests/capabilities/.gitignore create mode 100644 tools/testing/selftests/capabilities/Makefile create mode 100644 tools/testing/selftests/capabilities/test_execve.c create mode 100644 tools/testing/selftests/capabilities/validate_cap.c diff --git a/tools/testing/selftests/capabilities/.gitignore b/tools/testing/selftests/capabilities/.gitignore new file mode 100644 index 000000000000..b732dd0d4738 --- /dev/null +++ b/tools/testing/selftests/capabilities/.gitignore @@ -0,0 +1,2 @@ +test_execve +validate_cap diff --git a/tools/testing/selftests/capabilities/Makefile b/tools/testing/selftests/capabilities/Makefile new file mode 100644 index 000000000000..8c8f0c1f0889 --- /dev/null +++ b/tools/testing/selftests/capabilities/Makefile @@ -0,0 +1,18 @@ +all: + +include ../lib.mk + +.PHONY: all clean + +TARGETS := validate_cap test_execve +TEST_PROGS := test_execve + +CFLAGS := -O2 -g -std=gnu99 -Wall -lcap-ng + +all: $(TARGETS) + +clean: + $(RM) $(TARGETS) + +$(TARGETS): %: %.c + $(CC) -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl diff --git a/tools/testing/selftests/capabilities/test_execve.c b/tools/testing/selftests/capabilities/test_execve.c new file mode 100644 index 000000000000..10a21a958aaf --- /dev/null +++ b/tools/testing/selftests/capabilities/test_execve.c @@ -0,0 +1,427 @@ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef PR_CAP_AMBIENT +#define PR_CAP_AMBIENT 47 +# define PR_CAP_AMBIENT_IS_SET 1 +# define PR_CAP_AMBIENT_RAISE 2 +# define PR_CAP_AMBIENT_LOWER 3 +# define PR_CAP_AMBIENT_CLEAR_ALL 4 +#endif + +static int nerrs; + +static void vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap) +{ + char buf[4096]; + int fd; + ssize_t written; + int buf_len; + + buf_len = vsnprintf(buf, sizeof(buf), fmt, ap); + if (buf_len < 0) { + err(1, "vsnprintf failed"); + } + if (buf_len >= sizeof(buf)) { + errx(1, "vsnprintf output truncated"); + } + + fd = open(filename, O_WRONLY); + if (fd < 0) { + if ((errno == ENOENT) && enoent_ok) + return; + err(1, "open of %s failed", filename); + } + written = write(fd, buf, buf_len); + if (written != buf_len) { + if (written >= 0) { + errx(1, "short write to %s", filename); + } else { + err(1, "write to %s failed", filename); + } + } + if (close(fd) != 0) { + err(1, "close of %s failed", filename); + } +} + +static void maybe_write_file(char *filename, char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vmaybe_write_file(true, filename, fmt, ap); + va_end(ap); +} + +static void write_file(char *filename, char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vmaybe_write_file(false, filename, fmt, ap); + va_end(ap); +} + +static bool create_and_enter_ns(uid_t inner_uid) +{ + uid_t outer_uid; + gid_t outer_gid; + int i; + bool have_outer_privilege; + + outer_uid = getuid(); + outer_gid = getgid(); + + /* + * TODO: If we're already root, we could skip creating the userns. 
+ */ + + if (unshare(CLONE_NEWNS) == 0) { + printf("[NOTE]\tUsing global UIDs for tests\n"); + if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) != 0) + err(1, "PR_SET_KEEPCAPS"); + if (setresuid(inner_uid, inner_uid, -1) != 0) + err(1, "setresuid"); + + // Re-enable effective caps + capng_get_caps_process(); + for (i = 0; i < CAP_LAST_CAP; i++) + if (capng_have_capability(CAPNG_PERMITTED, i)) + capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, i); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + + have_outer_privilege = true; + } else if (unshare(CLONE_NEWUSER | CLONE_NEWNS) == 0) { + printf("[NOTE]\tUsing a user namespace for tests\n"); + maybe_write_file("/proc/self/setgroups", "deny"); + write_file("/proc/self/uid_map", "%d %d 1", inner_uid, outer_uid); + write_file("/proc/self/gid_map", "0 %d 1", outer_gid); + + have_outer_privilege = false; + } else { + errx(1, "must be root or be able to create a userns"); + } + + if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0) + err(1, "remount everything private"); + + return have_outer_privilege; +} + +static void chdir_to_tmpfs(void) +{ + char cwd[PATH_MAX]; + if (getcwd(cwd, sizeof(cwd)) != cwd) + err(1, "getcwd"); + + if (mount("private_tmp", ".", "tmpfs", 0, "mode=0777") != 0) + err(1, "mount private tmpfs"); + + if (chdir(cwd) != 0) + err(1, "chdir to private tmpfs"); + + if (umount2(".", MNT_DETACH) != 0) + err(1, "detach private tmpfs"); +} + +static void copy_fromat_to(int fromfd, const char *fromname, const char *toname) +{ + int from = openat(fromfd, fromname, O_RDONLY); + if (from == -1) + err(1, "open copy source"); + + int to = open(toname, O_CREAT | O_WRONLY | O_EXCL, 0700); + + while (true) { + char buf[4096]; + ssize_t sz = read(from, buf, sizeof(buf)); + if (sz == 0) + break; + if (sz < 0) + err(1, "read"); + + if (write(to, buf, sz) != sz) + err(1, "write"); /* no short writes on tmpfs */ + } + + close(from); + close(to); +} + +static bool fork_wait(void) +{ + pid_t child = fork(); + if (child == 0) { + nerrs = 0; + return true; + } else if (child > 0) { + int status; + if (waitpid(child, &status, 0) != child || + !WIFEXITED(status)) { + printf("[FAIL]\tChild died\n"); + nerrs++; + } else if (WEXITSTATUS(status) != 0) { + printf("[FAIL]\tChild failed\n"); + nerrs++; + } else { + printf("[OK]\tChild succeeded\n"); + } + + return false; + } else { + err(1, "fork"); + } +} + +static void exec_other_validate_cap(const char *name, + bool eff, bool perm, bool inh, bool ambient) +{ + execl(name, name, (eff ? "1" : "0"), + (perm ? "1" : "0"), (inh ? "1" : "0"), (ambient ? 
"1" : "0"), + NULL); + err(1, "execl"); +} + +static void exec_validate_cap(bool eff, bool perm, bool inh, bool ambient) +{ + exec_other_validate_cap("./validate_cap", eff, perm, inh, ambient); +} + +static int do_tests(int uid, const char *our_path) +{ + bool have_outer_privilege = create_and_enter_ns(uid); + + int ourpath_fd = open(our_path, O_RDONLY | O_DIRECTORY); + if (ourpath_fd == -1) + err(1, "open '%s'", our_path); + + chdir_to_tmpfs(); + + copy_fromat_to(ourpath_fd, "validate_cap", "validate_cap"); + + if (have_outer_privilege) { + uid_t gid = getegid(); + + copy_fromat_to(ourpath_fd, "validate_cap", + "validate_cap_suidroot"); + if (chown("validate_cap_suidroot", 0, -1) != 0) + err(1, "chown"); + if (chmod("validate_cap_suidroot", S_ISUID | 0700) != 0) + err(1, "chmod"); + + copy_fromat_to(ourpath_fd, "validate_cap", + "validate_cap_suidnonroot"); + if (chown("validate_cap_suidnonroot", uid + 1, -1) != 0) + err(1, "chown"); + if (chmod("validate_cap_suidnonroot", S_ISUID | 0700) != 0) + err(1, "chmod"); + + copy_fromat_to(ourpath_fd, "validate_cap", + "validate_cap_sgidroot"); + if (chown("validate_cap_sgidroot", -1, 0) != 0) + err(1, "chown"); + if (chmod("validate_cap_sgidroot", S_ISGID | 0710) != 0) + err(1, "chmod"); + + copy_fromat_to(ourpath_fd, "validate_cap", + "validate_cap_sgidnonroot"); + if (chown("validate_cap_sgidnonroot", -1, gid + 1) != 0) + err(1, "chown"); + if (chmod("validate_cap_sgidnonroot", S_ISGID | 0710) != 0) + err(1, "chmod"); +} + + capng_get_caps_process(); + + /* Make sure that i starts out clear */ + capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + + if (uid == 0) { + printf("[RUN]\tRoot => ep\n"); + if (fork_wait()) + exec_validate_cap(true, true, false, false); + } else { + printf("[RUN]\tNon-root => no caps\n"); + if (fork_wait()) + exec_validate_cap(false, false, false, false); + } + + printf("[OK]\tCheck cap_ambient manipulation rules\n"); + + /* We should not be able to add ambient caps yet. 
*/ + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != -1 || errno != EPERM) { + if (errno == EINVAL) + printf("[FAIL]\tPR_CAP_AMBIENT_RAISE isn't supported\n"); + else + printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed with EPERM on a non-inheritable cap\n"); + return 1; + } + printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-inheritable cap\n"); + + capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_RAW); + capng_update(CAPNG_DROP, CAPNG_PERMITTED, CAP_NET_RAW); + capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, CAP_NET_RAW); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_RAW, 0, 0, 0) != -1 || errno != EPERM) { + printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have failed on a non-permitted cap\n"); + return 1; + } + printf("[OK]\tPR_CAP_AMBIENT_RAISE failed on non-permitted cap\n"); + + capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) { + printf("[FAIL]\tPR_CAP_AMBIENT_RAISE should have succeeded\n"); + return 1; + } + printf("[OK]\tPR_CAP_AMBIENT_RAISE worked\n"); + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 1) { + printf("[FAIL]\tPR_CAP_AMBIENT_IS_SET is broken\n"); + return 1; + } + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0, 0) != 0) + err(1, "PR_CAP_AMBIENT_CLEAR_ALL"); + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) { + printf("[FAIL]\tPR_CAP_AMBIENT_CLEAR_ALL didn't work\n"); + return 1; + } + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) + err(1, "PR_CAP_AMBIENT_RAISE"); + + capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) { + printf("[FAIL]\tDropping I should have dropped A\n"); + return 1; + } + + printf("[OK]\tBasic manipulation appears to work\n"); + + capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE); + if (capng_apply(CAPNG_SELECT_CAPS) != 0) + err(1, "capng_apply"); + if (uid == 0) { + printf("[RUN]\tRoot +i => eip\n"); + if (fork_wait()) + exec_validate_cap(true, true, true, false); + } else { + printf("[RUN]\tNon-root +i => i\n"); + if (fork_wait()) + exec_validate_cap(false, false, true, false); + } + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) + err(1, "PR_CAP_AMBIENT_RAISE"); + + printf("[RUN]\tUID %d +ia => eipa\n", uid); + if (fork_wait()) + exec_validate_cap(true, true, true, true); + + /* The remaining tests need real privilege */ + + if (!have_outer_privilege) { + printf("[SKIP]\tSUID/SGID tests (needs privilege)\n"); + goto done; + } + + if (uid == 0) { + printf("[RUN]\tRoot +ia, suidroot => eipa\n"); + if (fork_wait()) + exec_other_validate_cap("./validate_cap_suidroot", + true, true, true, true); + + printf("[RUN]\tRoot +ia, suidnonroot => ip\n"); + if (fork_wait()) + exec_other_validate_cap("./validate_cap_suidnonroot", + false, true, true, false); + + printf("[RUN]\tRoot +ia, sgidroot => eipa\n"); + if (fork_wait()) + exec_other_validate_cap("./validate_cap_sgidroot", + true, true, true, true); + + if (fork_wait()) { + printf("[RUN]\tRoot, gid != 0, +ia, sgidroot => eip\n"); + if (setresgid(1, 1, 1) != 0) +
err(1, "setresgid"); + exec_other_validate_cap("./validate_cap_sgidroot", + true, true, true, false); + } + + printf("[RUN]\tRoot +ia, sgidnonroot => eip\n"); + if (fork_wait()) + exec_other_validate_cap("./validate_cap_sgidnonroot", + true, true, true, false); + } else { + printf("[RUN]\tNon-root +ia, sgidnonroot => i\n"); + exec_other_validate_cap("./validate_cap_sgidnonroot", + false, false, true, false); + + if (fork_wait()) { + printf("[RUN]\tNon-root +ia, sgidroot => i\n"); + if (setresgid(1, 1, 1) != 0) + err(1, "setresgid"); + exec_other_validate_cap("./validate_cap_sgidroot", + false, false, true, false); + } + } + +done: + return nerrs ? 1 : 0; +} + +int main(int argc, char **argv) +{ + char *tmp1, *tmp2, *our_path; + + /* Find our path */ + tmp1 = strdup(argv[0]); + if (!tmp1) + err(1, "strdup"); + tmp2 = dirname(tmp1); + our_path = strdup(tmp2); + if (!our_path) + err(1, "strdup"); + free(tmp1); + + if (fork_wait()) { + printf("[RUN]\t+++ Tests with uid == 0 +++\n"); + return do_tests(0, our_path); + } + + if (fork_wait()) { + printf("[RUN]\t+++ Tests with uid != 0 +++\n"); + return do_tests(1, our_path); + } + + return nerrs ? 1 : 0; +} diff --git a/tools/testing/selftests/capabilities/validate_cap.c b/tools/testing/selftests/capabilities/validate_cap.c new file mode 100644 index 000000000000..dd3c45f7b23c --- /dev/null +++ b/tools/testing/selftests/capabilities/validate_cap.c @@ -0,0 +1,73 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef PR_CAP_AMBIENT +#define PR_CAP_AMBIENT 47 +# define PR_CAP_AMBIENT_IS_SET 1 +# define PR_CAP_AMBIENT_RAISE 2 +# define PR_CAP_AMBIENT_LOWER 3 +# define PR_CAP_AMBIENT_CLEAR_ALL 4 +#endif + +#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 19) +# define HAVE_GETAUXVAL +#endif + +static bool bool_arg(char **argv, int i) +{ + if (!strcmp(argv[i], "0")) + return false; + else if (!strcmp(argv[i], "1")) + return true; + else + errx(1, "wrong argv[%d]", i); +} + +int main(int argc, char **argv) +{ + const char *atsec = ""; + + /* + * Be careful just in case a setgid or setcapped copy of this + * helper gets out. + */ + + if (argc != 5) + errx(1, "wrong argc"); + +#ifdef HAVE_GETAUXVAL + if (getauxval(AT_SECURE)) + atsec = " (AT_SECURE is set)"; + else + atsec = " (AT_SECURE is not set)"; +#endif + + capng_get_caps_process(); + + if (capng_have_capability(CAPNG_EFFECTIVE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 1)) { + printf("[FAIL]\tWrong effective state%s\n", atsec); + return 1; + } + if (capng_have_capability(CAPNG_PERMITTED, CAP_NET_BIND_SERVICE) != bool_arg(argv, 2)) { + printf("[FAIL]\tWrong permitted state%s\n", atsec); + return 1; + } + if (capng_have_capability(CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 3)) { + printf("[FAIL]\tWrong inheritable state%s\n", atsec); + return 1; + } + + if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != bool_arg(argv, 4)) { + printf("[FAIL]\tWrong ambient state%s\n", atsec); + return 1; + } + + printf("[OK]\tCapabilities after execve were correct\n"); + return 0; +} -- cgit v1.2.3 From 746bf6d64275be0c65b0631d8a72b16f1454cfa1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 4 Sep 2015 15:42:51 -0700 Subject: capabilities: add a securebit to disable PR_CAP_AMBIENT_RAISE Per Andrew Morgan's request, add a securebit to allow admins to disable PR_CAP_AMBIENT_RAISE. This securebit will prevent processes from adding capabilities to their ambient set. 
For simplicity, this disables PR_CAP_AMBIENT_RAISE entirely rather than just disabling setting previously cleared bits. Signed-off-by: Andy Lutomirski Acked-by: Andrew G. Morgan Acked-by: Serge Hallyn Cc: Kees Cook Cc: Christoph Lameter Cc: Serge Hallyn Cc: Jonathan Corbet Cc: Aaron Jones Cc: Ted Ts'o Cc: Andrew G. Morgan Cc: Mimi Zohar Cc: Austin S Hemmelgarn Cc: Markku Savela Cc: Jarkko Sakkinen Cc: Michael Kerrisk Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/securebits.h | 11 ++++++++++- security/commoncap.c | 3 ++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/securebits.h b/include/uapi/linux/securebits.h index 985aac9e6bf8..35ac35cef217 100644 --- a/include/uapi/linux/securebits.h +++ b/include/uapi/linux/securebits.h @@ -43,9 +43,18 @@ #define SECBIT_KEEP_CAPS (issecure_mask(SECURE_KEEP_CAPS)) #define SECBIT_KEEP_CAPS_LOCKED (issecure_mask(SECURE_KEEP_CAPS_LOCKED)) +/* When set, a process cannot add new capabilities to its ambient set. */ +#define SECURE_NO_CAP_AMBIENT_RAISE 6 +#define SECURE_NO_CAP_AMBIENT_RAISE_LOCKED 7 /* make bit-6 immutable */ + +#define SECBIT_NO_CAP_AMBIENT_RAISE (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE)) +#define SECBIT_NO_CAP_AMBIENT_RAISE_LOCKED \ + (issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE_LOCKED)) + #define SECURE_ALL_BITS (issecure_mask(SECURE_NOROOT) | \ issecure_mask(SECURE_NO_SETUID_FIXUP) | \ - issecure_mask(SECURE_KEEP_CAPS)) + issecure_mask(SECURE_KEEP_CAPS) | \ + issecure_mask(SECURE_NO_CAP_AMBIENT_RAISE)) #define SECURE_ALL_LOCKS (SECURE_ALL_BITS << 1) #endif /* _UAPI_LINUX_SECUREBITS_H */ diff --git a/security/commoncap.c b/security/commoncap.c index 1f74dde1063e..1832cf701c3d 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -993,7 +993,8 @@ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, if (arg2 == PR_CAP_AMBIENT_RAISE && (!cap_raised(current_cred()->cap_permitted, arg3) || !cap_raised(current_cred()->cap_inheritable, - arg3))) + arg3) || + issecure(SECURE_NO_CAP_AMBIENT_RAISE))) return -EPERM; new = prepare_creds(); -- cgit v1.2.3 From 031e29b5877f31676739dc2f847d04c2c0732034 Mon Sep 17 00:00:00 2001 From: Yuriy Kolerov Date: Fri, 4 Sep 2015 15:42:58 -0700 Subject: drivers/video/console: add negative dependency for VGA_CONSOLE on ARC Architectures which support VGA console must define the screen_info structure from "uapi/linux/screen_info.h". Otherwise an undefined-symbol error occurs. Usually it's defined in "setup.c" for each architecture. If an architecture does not support VGA console (ARC's case) there are two ways: define a dummy instance of screen_info, or add a negative dependency for VGA_CONSOLE to prevent selecting this option. I've implemented the second way. However, the best solution is to add a HAVE_VGA_CONSOLE option for targets which support VGA console, turn off VGA_CONSOLE by default, and make it depend on HAVE_VGA_CONSOLE. But right now it's better to just add a negative dependency for ARC and then consider how to collaborate about this issue with maintainers of other architectures.
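For reference, the first alternative mentioned above would look roughly like this (a hypothetical sketch for the architecture's setup.c, mirroring what architectures without VGA hardware already do; not part of this patch):

	#include <linux/screen_info.h>

	/* Dummy, zero-initialized instance: it exists only to satisfy the
	 * VGA console code's reference to the symbol on a VGA-less platform. */
	struct screen_info screen_info;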
Signed-off-by: Yuriy Kolerov Acked-by: Geert Uytterhoeven Cc: Jean-Christophe Plagniol-Villard Cc: Tomi Valkeinen Cc: Jaya Kumar Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/console/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig index ba97efc3bf70..071280643db7 100644 --- a/drivers/video/console/Kconfig +++ b/drivers/video/console/Kconfig @@ -9,7 +9,7 @@ config VGA_CONSOLE depends on !4xx && !8xx && !SPARC && !M68K && !PARISC && !FRV && \ !SUPERH && !BLACKFIN && !AVR32 && !MN10300 && !CRIS && \ (!ARM || ARCH_FOOTBRIDGE || ARCH_INTEGRATOR || ARCH_NETWINDER) && \ - !ARM64 + !ARM64 && !ARC default y help Saying Y here will allow you to use Linux in text mode through a -- cgit v1.2.3 From 7c49b8616460ebb12ee56d80d1abfbc20b6f3cbb Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 4 Sep 2015 15:43:01 -0700 Subject: fs/notify: optimize inotify/fsnotify code for unwatched files I have a _tiny_ microbenchmark that sits in a loop and writes single bytes to a file. Writing one byte to a tmpfs file is around 2x slower than reading one byte from a file, which is a _bit_ more than I expected. This is a dumb benchmark, but I think it's hard to deny that write() is a hot path and we should avoid unnecessary overhead there. I did a 'perf record' of 30-second samples of read and write. The top item in a diffprofile is srcu_read_lock() from fsnotify(). There are active inotify fd's from systemd, but nothing is actually listening to the file or that part of the filesystem. I *think* we can avoid taking the srcu_read_lock() for the common case where there are no actual marks on the file. This means that there will both be nothing to notify for *and* implies that there is no need for clearing the ignore mask. This patch gave a 13.1% speedup in writes/second on my test, which is an improvement from the 10.8% that I saw with the last version. Signed-off-by: Dave Hansen Reviewed-by: Jan Kara Cc: Al Viro Cc: Eric Paris Cc: John McCutchan Cc: Robert Love Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fsnotify.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index dd3fb0b17be7..d675e76251d3 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -204,6 +204,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, else mnt = NULL; + /* + * Optimization: srcu_read_lock() has a memory barrier which can + * be expensive. It protects walking the *_fsnotify_marks lists. + * However, if we do not walk the lists, we do not have to do + * SRCU because we have no references to any objects and do not + * need SRCU to keep them "alive". + */ + if (hlist_empty(&to_tell->i_fsnotify_marks) && + (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks))) + return 0; /* * if this is a modify event we may need to clear the ignored masks * otherwise return if neither the inode nor the vfsmount care about -- cgit v1.2.3 From 3c53e514212455db9923c203694a72007558b48f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Sep 2015 15:43:03 -0700 Subject: fsnotify: fix check in inotify fdinfo printing A check in inotify_fdinfo() testing whether a mark is valid was always true due to a bug. Luckily we can never get to invalidated marks since we hold mark_mutex and invalidated marks get removed from the group list when they are invalidated under that mutex.
Anyway fix the check to make code more future proof. Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fdinfo.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 58b7cdb63da9..6b6f0d472ae8 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -76,7 +76,8 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) struct inotify_inode_mark *inode_mark; struct inode *inode; - if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE))) + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) || + !(mark->flags & FSNOTIFY_MARK_FLAG_INODE)) return; inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); -- cgit v1.2.3 From 1e39fc01836d02a11515aaabd97a0a938326bfe2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Sep 2015 15:43:06 -0700 Subject: fsnotify: document mark locking Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fsnotify_backend.h | 50 ++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 65a517dd32f7..dd6ddb0287ed 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -195,40 +195,50 @@ struct fsnotify_group { #define FSNOTIFY_EVENT_INODE 2 /* - * a mark is simply an object attached to an in core inode which allows an + * A mark is simply an object attached to an in core inode which allows an * fsnotify listener to indicate they are either no longer interested in events * of a type matching mask or only interested in those events. * - * these are flushed when an inode is evicted from core and may be flushed - * when the inode is modified (as seen by fsnotify_access). Some fsnotify users - * (such as dnotify) will flush these when the open fd is closed and not at - * inode eviction or modification. + * These are flushed when an inode is evicted from core and may be flushed + * when the inode is modified (as seen by fsnotify_access). Some fsnotify + * users (such as dnotify) will flush these when the open fd is closed and not + * at inode eviction or modification. + * + * Text in brackets is showing the lock(s) protecting modifications of a + * particular entry. obj_lock means either inode->i_lock or + * mnt->mnt_root->d_lock depending on the mark type. */ struct fsnotify_mark { - __u32 mask; /* mask this mark is for */ - /* we hold ref for each i_list and g_list. also one ref for each 'thing' + /* Mask this mark is for [mark->lock, group->mark_mutex] */ + __u32 mask; + /* We hold one for presence in g_list. Also one ref for each 'thing' * in kernel that found and may be using this mark. */ - atomic_t refcnt; /* active things looking at this mark */ - struct fsnotify_group *group; /* group this mark is for */ - struct list_head g_list; /* list of marks by group->i_fsnotify_marks - * Also reused for queueing mark into - * destroy_list when it's waiting for - * the end of SRCU period before it can - * be freed */ - spinlock_t lock; /* protect group and inode */ - struct hlist_node obj_list; /* list of marks for inode / vfsmount */ - struct list_head free_list; /* tmp list used when freeing this mark */ - union { + atomic_t refcnt; + /* Group this mark is for. Set on mark creation, stable until last ref + * is dropped */ + struct fsnotify_group *group; + /* List of marks by group->i_fsnotify_marks. 
Also reused for queueing + * mark into destroy_list when it's waiting for the end of SRCU period + * before it can be freed. [group->mark_mutex] */ + struct list_head g_list; + /* Protects inode / mnt pointers, flags, masks */ + spinlock_t lock; + /* List of marks for inode / vfsmount [obj_lock] */ + struct hlist_node obj_list; + /* tmp list used when freeing this mark */ + struct list_head free_list; + union { /* Object pointer [mark->lock, group->mark_mutex] */ struct inode *inode; /* inode this mark is associated with */ struct vfsmount *mnt; /* vfsmount this mark is associated with */ }; - __u32 ignored_mask; /* events types to ignore */ + /* Events types to ignore [mark->lock, group->mark_mutex] */ + __u32 ignored_mask; #define FSNOTIFY_MARK_FLAG_INODE 0x01 #define FSNOTIFY_MARK_FLAG_VFSMOUNT 0x02 #define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04 #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 #define FSNOTIFY_MARK_FLAG_ALIVE 0x10 - unsigned int flags; /* vfsmount or inode mark? */ + unsigned int flags; /* flags [mark->lock] */ void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ }; -- cgit v1.2.3 From 925d1132a03e33cb8f29a0057300d023b4f1be23 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Sep 2015 15:43:09 -0700 Subject: fsnotify: remove mark->free_list Free list is used when all marks on given inode / mount should be destroyed when inode / mount is going away. However we can free all of the marks without using a special list with some care. Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fsnotify.c | 1 - fs/notify/fsnotify.h | 21 +++++++++++++++------ fs/notify/inode_mark.c | 20 -------------------- fs/notify/mark.c | 40 +++++++++++++++++++++++++--------------- fs/notify/vfsmount_mark.c | 19 ------------------- include/linux/fsnotify_backend.h | 2 -- 6 files changed, 40 insertions(+), 63 deletions(-) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index d675e76251d3..db39de2dd4cb 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -26,7 +26,6 @@ #include #include "fsnotify.h" -#include "../mount.h" /* * Clear all of the marks on an inode when it is being evicted from core diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 13a00be516d2..b44c68a857e7 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -6,6 +6,8 @@ #include #include +#include "../mount.h" + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); @@ -38,15 +40,22 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); /* inode specific destruction of a mark */ extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); -/* Destroy all marks in the given list */ -extern void fsnotify_destroy_marks(struct list_head *to_free); /* Find mark belonging to given group in the list of marks */ extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, struct fsnotify_group *group); -/* run the list of all marks associated with inode and flag them to be freed */ -extern void fsnotify_clear_marks_by_inode(struct inode *inode); -/* run the list of all marks associated with vfsmount and flag them to be freed */ -extern void fsnotify_clear_marks_by_mount(struct vfsmount *mnt); +/* Destroy all marks in the given list protected by 'lock' */ +extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock); +/* run the 
list of all marks associated with inode and destroy them */ +static inline void fsnotify_clear_marks_by_inode(struct inode *inode) +{ + fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock); +} +/* run the list of all marks associated with vfsmount and destroy them */ +static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) +{ + fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, + &mnt->mnt_root->d_lock); +} /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 3daf513ee99e..474a3ce1b5e1 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -64,26 +64,6 @@ void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) spin_unlock(&inode->i_lock); } -/* - * Given an inode, destroy all of the marks associated with that inode. - */ -void fsnotify_clear_marks_by_inode(struct inode *inode) -{ - struct fsnotify_mark *mark; - struct hlist_node *n; - LIST_HEAD(free_list); - - spin_lock(&inode->i_lock); - hlist_for_each_entry_safe(mark, n, &inode->i_fsnotify_marks, obj_list) { - list_add(&mark->free_list, &free_list); - hlist_del_init_rcu(&mark->obj_list); - fsnotify_get_mark(mark); - } - spin_unlock(&inode->i_lock); - - fsnotify_destroy_marks(&free_list); -} - /* * Given a group clear all of the inode marks associated with that group. */ diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 39ddcaf0918f..3b2d1ba41e7b 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -203,24 +203,34 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, mutex_unlock(&group->mark_mutex); } -/* - * Destroy all marks in the given list. The marks must be already detached from - * the original inode / vfsmount. - */ -void fsnotify_destroy_marks(struct list_head *to_free) +void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) { - struct fsnotify_mark *mark, *lmark; - struct fsnotify_group *group; - - list_for_each_entry_safe(mark, lmark, to_free, free_list) { - spin_lock(&mark->lock); - fsnotify_get_group(mark->group); - group = mark->group; - spin_unlock(&mark->lock); + struct fsnotify_mark *mark; - fsnotify_destroy_mark(mark, group); + while (1) { + /* + * We have to be careful since we can race with e.g. + * fsnotify_clear_marks_by_group() and once we drop 'lock', + * mark can get removed from the obj_list and destroyed. But + * we are holding mark reference so mark cannot be freed and + * calling fsnotify_destroy_mark() more than once is fine. + */ + spin_lock(lock); + if (hlist_empty(head)) { + spin_unlock(lock); + break; + } + mark = hlist_entry(head->first, struct fsnotify_mark, obj_list); + /* + * We don't update i_fsnotify_mask / mnt_fsnotify_mask here + * since inode / mount is going away anyway. So just remove + * mark from the list. 
+ */ + hlist_del_init_rcu(&mark->obj_list); + fsnotify_get_mark(mark); + spin_unlock(lock); + fsnotify_destroy_mark(mark, mark->group); fsnotify_put_mark(mark); - fsnotify_put_group(group); } } diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 326b148e623c..a8fcab68faef 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -28,25 +28,6 @@ #include #include "fsnotify.h" -#include "../mount.h" - -void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) -{ - struct fsnotify_mark *mark; - struct hlist_node *n; - struct mount *m = real_mount(mnt); - LIST_HEAD(free_list); - - spin_lock(&mnt->mnt_root->d_lock); - hlist_for_each_entry_safe(mark, n, &m->mnt_fsnotify_marks, obj_list) { - list_add(&mark->free_list, &free_list); - hlist_del_init_rcu(&mark->obj_list); - fsnotify_get_mark(mark); - } - spin_unlock(&mnt->mnt_root->d_lock); - - fsnotify_destroy_marks(&free_list); -} void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) { diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index dd6ddb0287ed..f044fe30e8c3 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -225,8 +225,6 @@ struct fsnotify_mark { spinlock_t lock; /* List of marks for inode / vfsmount [obj_lock] */ struct hlist_node obj_list; - /* tmp list used when freeing this mark */ - struct list_head free_list; union { /* Object pointer [mark->lock, group->mark_mutex] */ struct inode *inode; /* inode this mark is associated with */ struct vfsmount *mnt; /* vfsmount this mark is associated with */ -- cgit v1.2.3 From 4712e722f91457e60723b9cef6265a74290efba9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Sep 2015 15:43:12 -0700 Subject: fsnotify: get rid of fsnotify_destroy_mark_locked() fsnotify_destroy_mark_locked() is subtle to use because it temporarily releases group->mark_mutex. To avoid future problems with this function, split it into two. fsnotify_detach_mark() is the part that needs group->mark_mutex and fsnotify_free_mark() is the part that must be called outside of group->mark_mutex. This way it's much clearer what's going on and we also avoid some pointless acquisitions of group->mark_mutex. 
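To make the new calling convention concrete, the pattern the split introduces looks like this (a minimal sketch, not verbatim kernel code; the dnotify and fanotify hunks below follow exactly this shape):

	mutex_lock(&group->mark_mutex);
	fsnotify_detach_mark(mark);	/* unhash from object list, drop from g_list */
	mutex_unlock(&group->mark_mutex);
	fsnotify_free_mark(mark);	/* queue for destruction after the SRCU period */
	fsnotify_put_mark(mark);	/* drop the caller's reference */

Detach is the only step that needs group->mark_mutex; freeing is deferred anyway, so it can safely run after the mutex is dropped.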
Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 14 +++++--- fs/notify/fanotify/fanotify_user.c | 8 +++-- fs/notify/mark.c | 73 +++++++++++++++++++++----------------- include/linux/fsnotify_backend.h | 7 ++-- 4 files changed, 61 insertions(+), 41 deletions(-) diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 44523f4a6084..6faaf710e563 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -154,6 +154,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) struct dnotify_struct *dn; struct dnotify_struct **prev; struct inode *inode; + bool free = false; inode = file_inode(filp); if (!S_ISDIR(inode->i_mode)) @@ -182,11 +183,15 @@ void dnotify_flush(struct file *filp, fl_owner_t id) /* nothing else could have found us thanks to the dnotify_groups mark_mutex */ - if (dn_mark->dn == NULL) - fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); + if (dn_mark->dn == NULL) { + fsnotify_detach_mark(fsn_mark); + free = true; + } mutex_unlock(&dnotify_group->mark_mutex); + if (free) + fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); } @@ -362,9 +367,10 @@ out: spin_unlock(&fsn_mark->lock); if (destroy) - fsnotify_destroy_mark_locked(fsn_mark, dnotify_group); - + fsnotify_detach_mark(fsn_mark); mutex_unlock(&dnotify_group->mark_mutex); + if (destroy) + fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); out_err: if (new_fsn_mark) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index cf275500a665..8e8e6bcd1d43 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -529,8 +529,10 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark_locked(fsn_mark, group); + fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); + if (destroy_mark) + fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); if (removed & real_mount(mnt)->mnt_fsnotify_mask) @@ -557,8 +559,10 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); if (destroy_mark) - fsnotify_destroy_mark_locked(fsn_mark, group); + fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); + if (destroy_mark) + fsnotify_free_mark(fsn_mark); /* matches the fsnotify_find_inode_mark() */ fsnotify_put_mark(fsn_mark); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 3b2d1ba41e7b..fc0df4442f7b 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -122,26 +122,27 @@ u32 fsnotify_recalc_mask(struct hlist_head *head) } /* - * Any time a mark is getting freed we end up here. - * The caller had better be holding a reference to this mark so we don't actually - * do the final put under the mark->lock + * Remove mark from inode / vfsmount list, group list, drop inode reference + * if we got one. + * + * Must be called with group->mark_mutex held. 
*/ -void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, - struct fsnotify_group *group) +void fsnotify_detach_mark(struct fsnotify_mark *mark) { struct inode *inode = NULL; + struct fsnotify_group *group = mark->group; BUG_ON(!mutex_is_locked(&group->mark_mutex)); spin_lock(&mark->lock); /* something else already called this function on this mark */ - if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&mark->lock); return; } - mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { inode = mark->inode; @@ -150,6 +151,12 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, fsnotify_destroy_vfsmount_mark(mark); else BUG(); + /* + * Note that we didn't update flags telling whether inode cares about + * what's happening with children. We update these flags from + * __fsnotify_parent() lazily when next event happens on one of our + * children. + */ list_del_init(&mark->g_list); @@ -157,18 +164,32 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) iput(inode); - /* release lock temporarily */ - mutex_unlock(&group->mark_mutex); + + atomic_dec(&group->num_marks); +} + +/* + * Free fsnotify mark. The freeing is actually happening from a kthread which + * first waits for srcu period end. Caller must have a reference to the mark + * or be protected by fsnotify_mark_srcu. + */ +void fsnotify_free_mark(struct fsnotify_mark *mark) +{ + struct fsnotify_group *group = mark->group; + + spin_lock(&mark->lock); + /* something else already called this function on this mark */ + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { + spin_unlock(&mark->lock); + return; + } + mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + spin_unlock(&mark->lock); spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); wake_up(&destroy_waitq); - /* - * We don't necessarily have a ref on mark from caller so the above destroy - * may have actually freed it, unless this group provides a 'freeing_mark' - * function which must be holding a reference. - */ /* * Some groups like to know that marks are being freed. This is a @@ -177,30 +198,15 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); - - /* - * __fsnotify_update_child_dentry_flags(inode); - * - * I really want to call that, but we can't, we have no idea if the inode - * still exists the second we drop the mark->lock. - * - * The next time an event arrive to this inode from one of it's children - * __fsnotify_parent will see that the inode doesn't care about it's - * children and will update all of these flags then. So really this - * is just a lazy update (and could be a perf win...) 
- */ - - atomic_dec(&group->num_marks); - - mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); } void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group) { mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); - fsnotify_destroy_mark_locked(mark, group); + fsnotify_detach_mark(mark); mutex_unlock(&group->mark_mutex); + fsnotify_free_mark(mark); } void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) @@ -342,7 +348,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, * inode->i_lock */ spin_lock(&mark->lock); - mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE; + mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; fsnotify_get_group(group); mark->group = group; @@ -448,8 +454,9 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, } mark = list_first_entry(&to_free, struct fsnotify_mark, g_list); fsnotify_get_mark(mark); - fsnotify_destroy_mark_locked(mark, group); + fsnotify_detach_mark(mark); mutex_unlock(&group->mark_mutex); + fsnotify_free_mark(mark); fsnotify_put_mark(mark); } } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index f044fe30e8c3..e0727d77feaf 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -236,6 +236,7 @@ struct fsnotify_mark { #define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04 #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 #define FSNOTIFY_MARK_FLAG_ALIVE 0x10 +#define FSNOTIFY_MARK_FLAG_ATTACHED 0x20 unsigned int flags; /* flags [mark->lock] */ void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ }; @@ -353,8 +354,10 @@ extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct fsnotify_ /* given a group and a mark, flag mark to be freed when all references are dropped */ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); -extern void fsnotify_destroy_mark_locked(struct fsnotify_mark *mark, - struct fsnotify_group *group); +/* detach mark from inode / mount list, group list, drop inode reference */ +extern void fsnotify_detach_mark(struct fsnotify_mark *mark); +/* free mark */ +extern void fsnotify_free_mark(struct fsnotify_mark *mark); /* run all the marks in a group, and clear all of the vfsmount marks */ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group); /* run all the marks in a group, and clear all of the inode marks */ -- cgit v1.2.3 From 779a6ce877bf711323f998b3a7382cdbe7350d87 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Fri, 4 Sep 2015 15:43:15 -0700 Subject: scripts/spelling.txt: add misspelled words for check misspelled words for check:- chcek chck cehck I myself did these spell mistakes in changelog for patches, Thus suggesting to add in spelling.txt, so that checkpatch.pl warns it earlier. References:- ./arch/powerpc/kernel/exceptions-64e.S:456: . . . 
make sure you chcek https://lkml.org/lkml/2015/6/25/289 ./arch/x86/mm/pageattr.c:1368: * No need to cehck in that case [akpm@linux-foundation.org: add whcih->which, whcih I always get wrong] Signed-off-by: Maninder Singh Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index bb8e4d0a1911..4bd8d1a3415f 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -187,6 +187,7 @@ capatibilities||capabilities carefuly||carefully cariage||carriage catagory||category +cehck||check challange||challenge challanges||challenges chanell||channel @@ -199,6 +200,8 @@ charactor||character charater||character charaters||characters charcter||character +chcek||check +chck||check checksuming||checksumming childern||children childs||children @@ -1028,6 +1031,7 @@ visiters||visitors vitual||virtual wating||waiting whataver||whatever +whcih||which whenver||whenever wheter||whether whe||when -- cgit v1.2.3 From c22b6ae69ee93c1ecc6821847a8542163fbf3e1a Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Fri, 4 Sep 2015 15:43:18 -0700 Subject: scripts/spelling.txt: spelling of uninitialized I just did a spelling mistake of uninitialized and wrote that as unintialized. Fortunately I noticed it in my final review. Signed-off-by: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 4bd8d1a3415f..bf30d2c0ec27 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -990,6 +990,7 @@ unexpectd||unexpected unexpeted||unexpected unfortunatelly||unfortunately unifiy||unify +unintialized||uninitialized unknonw||unknown unknow||unknown unkown||unknown -- cgit v1.2.3 From d40e1e6532efbb40f8fc1f5af093063a3d186754 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 4 Sep 2015 15:43:21 -0700 Subject: kerneldoc: Convert error messages to GNU error message format Editors like emacs and vi recognize a number of error message formats. The format used by the kerneldoc tool is not recognized by emacs. Change the kerneldoc error message format to the GNU style such that the emacs prev-error and next-error commands can be used to navigate through kerneldoc error messages. For more information about the GNU error message format, see also https://www.gnu.org/prep/standards/html_node/Errors.html. 
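As a before/after illustration (the file name and line number here are hypothetical), a diagnostic that used to read

	Error(drivers/foo.c:123): duplicate section name 'Return'

is emitted after this change as

	drivers/foo.c:123: error: duplicate section name 'Return'

which matches the FILE:LINE: TYPE: MESSAGE shape that emacs and vi know how to parse.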
This patch has been generated via the following sed command: sed -i.orig 's/Error(\${file}:\$.):/\${file}:\$.: error:/g;s/Warning(\${file}:\$.):/\${file}:\$.: warning:/g;s/Warning(\${file}):/\${file}:1: warning:/g;s/Info(\${file}:\$.):/\${file}:\$.: info:/g' scripts/kernel-doc Signed-off-by: Bart Van Assche Cc: Johannes Berg Acked-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/kernel-doc | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index a7bf5f68aacb..9a08fb5c1af6 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -469,7 +469,7 @@ sub dump_section { } else { # print STDERR "other section '$name' = '$contents'\n"; if (defined($sections{$name}) && ($sections{$name} ne "")) { - print STDERR "Error(${file}:$.): duplicate section name '$name'\n"; + print STDERR "${file}:$.: error: duplicate section name '$name'\n"; ++$errors; } $sections{$name} = $contents; @@ -1820,7 +1820,7 @@ sub dump_struct($$) { }); } else { - print STDERR "Error(${file}:$.): Cannot parse struct or union!\n"; + print STDERR "${file}:$.: error: Cannot parse struct or union!\n"; ++$errors; } } @@ -1841,7 +1841,7 @@ sub dump_enum($$) { push @parameterlist, $arg; if (!$parameterdescs{$arg}) { $parameterdescs{$arg} = $undescribed; - print STDERR "Warning(${file}:$.): Enum value '$arg' ". + print STDERR "${file}:$.: warning: Enum value '$arg' ". "not described in enum '$declaration_name'\n"; } @@ -1859,7 +1859,7 @@ sub dump_enum($$) { }); } else { - print STDERR "Error(${file}:$.): Cannot parse enum!\n"; + print STDERR "${file}:$.: error: Cannot parse enum!\n"; ++$errors; } } @@ -1887,7 +1887,7 @@ sub dump_typedef($$) { }); } else { - print STDERR "Error(${file}:$.): Cannot parse typedef!\n"; + print STDERR "${file}:$.: error: Cannot parse typedef!\n"; ++$errors; } } @@ -2019,11 +2019,11 @@ sub push_parameter($$$) { $parameterdescs{$param_name} = $undescribed; if (($type eq 'function') || ($type eq 'enum')) { - print STDERR "Warning(${file}:$.): Function parameter ". + print STDERR "${file}:$.: warning: Function parameter ". "or member '$param' not " . "described in '$declaration_name'\n"; } - print STDERR "Warning(${file}:$.):" . + print STDERR "${file}:$.: warning:" . " No description found for parameter '$param'\n"; ++$warnings; } @@ -2074,14 +2074,14 @@ sub check_sections($$$$$$) { } if ($err) { if ($decl_type eq "function") { - print STDERR "Warning(${file}:$.): " . + print STDERR "${file}:$.: warning: " . "Excess function parameter " . "'$sects[$sx]' " . "description in '$decl_name'\n"; ++$warnings; } else { if ($nested !~ m/\Q$sects[$sx]\E/) { - print STDERR "Warning(${file}:$.): " . + print STDERR "${file}:$.: warning: " . "Excess struct/union/enum/typedef member " . "'$sects[$sx]' " . "description in '$decl_name'\n"; @@ -2107,7 +2107,7 @@ sub check_return_section { if (!defined($sections{$section_return}) || $sections{$section_return} eq "") { - print STDERR "Warning(${file}:$.): " . + print STDERR "${file}:$.: warning: " . "No description found for return value of " . 
"'$declaration_name'\n"; ++$warnings; @@ -2186,7 +2186,7 @@ sub dump_function($$) { create_parameterlist($args, ',', $file); } else { - print STDERR "Warning(${file}:$.): cannot understand function prototype: '$prototype'\n"; + print STDERR "${file}:$.: warning: cannot understand function prototype: '$prototype'\n"; return; } @@ -2251,7 +2251,7 @@ sub tracepoint_munge($) { $tracepointargs = $1; } if (($tracepointname eq 0) || ($tracepointargs eq 0)) { - print STDERR "Warning(${file}:$.): Unrecognized tracepoint format: \n". + print STDERR "${file}:$.: warning: Unrecognized tracepoint format: \n". "$prototype\n"; } else { $prototype = "static inline void trace_$tracepointname($tracepointargs)"; @@ -2450,7 +2450,7 @@ sub process_file($) { } if (($declaration_purpose eq "") && $verbose) { - print STDERR "Warning(${file}:$.): missing initial short description on line:\n"; + print STDERR "${file}:$.: warning: missing initial short description on line:\n"; print STDERR $_; ++$warnings; } @@ -2468,10 +2468,10 @@ sub process_file($) { } if ($verbose) { - print STDERR "Info(${file}:$.): Scanning doc for $identifier\n"; + print STDERR "${file}:$.: info: Scanning doc for $identifier\n"; } } else { - print STDERR "Warning(${file}:$.): Cannot understand $_ on line $.", + print STDERR "${file}:$.: warning: Cannot understand $_ on line $.", " - I thought it was a doc line\n"; ++$warnings; $state = 0; @@ -2483,7 +2483,7 @@ sub process_file($) { if (($contents ne "") && ($contents ne "\n")) { if (!$in_doc_sect && $verbose) { - print STDERR "Warning(${file}:$.): contents before sections\n"; + print STDERR "${file}:$.: warning: contents before sections\n"; ++$warnings; } dump_section($file, $section, xml_escape($contents)); @@ -2509,7 +2509,7 @@ sub process_file($) { } # look for doc_com + + doc_end: if ($_ =~ m'\s*\*\s*[a-zA-Z_0-9:\.]+\*/') { - print STDERR "Warning(${file}:$.): suspicious ending line: $_"; + print STDERR "${file}:$.: warning: suspicious ending line: $_"; ++$warnings; } @@ -2539,7 +2539,7 @@ sub process_file($) { } } else { # i dont know - bad line? ignore. - print STDERR "Warning(${file}:$.): bad line: $_"; + print STDERR "${file}:$.: warning: bad line: $_"; ++$warnings; } } elsif ($state == 5) { # scanning for split parameters @@ -2631,7 +2631,7 @@ sub process_file($) { } } if ($initial_section_counter == $section_counter) { - print STDERR "Warning(${file}): no structured comments found\n"; + print STDERR "${file}:1: warning: no structured comments found\n"; if (($function_only == 1) && ($show_not_found == 1)) { print STDERR " Was looking for '$_'.\n" for keys %function_table; } -- cgit v1.2.3 From fa70900e0984792cc45a9e51c28684c3287058c2 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 4 Sep 2015 15:43:24 -0700 Subject: scripts/Lindent: handle missing indent gracefully If indent is not found, bail out immediately instead of spitting random shell script error messages. Signed-off-by: Jean Delvare Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/Lindent | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/Lindent b/scripts/Lindent index 9c4b3e2b7098..6d889de4e70b 100755 --- a/scripts/Lindent +++ b/scripts/Lindent @@ -1,6 +1,9 @@ #!/bin/sh PARAM="-npro -kr -i8 -ts8 -sob -l80 -ss -ncs -cp1" RES=`indent --version` +if [ "$RES" = "" ]; then + exit 1 +fi V1=`echo $RES | cut -d' ' -f3 | cut -d'.' -f1` V2=`echo $RES | cut -d' ' -f3 | cut -d'.' -f2` V3=`echo $RES | cut -d' ' -f3 | cut -d'.' 
-f3` -- cgit v1.2.3 From e260fe01fa39eddb05bd8b70fad5bc9a129648f2 Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Fri, 4 Sep 2015 15:43:26 -0700 Subject: scripts: decode_stacktrace: fix ARM architecture decoding Fix the stack decoder for the ARM architecture. An ARM stack is designed as : [ 81.547704] [] (bucket_find_contain) from [] (check_sync+0x40/0x4f8) [ 81.559668] [] (check_sync) from [] (debug_dma_sync_sg_for_cpu+0x128/0x194) [ 81.571583] [] (debug_dma_sync_sg_for_cpu) from [] (__videobuf_s The current script doesn't expect the symbols to be bound by parenthesis, and triggers the following errors : awk: cmd. line:1: error: Unmatched ( or \(: / (check_sync$/ [ 81.547704] (bucket_find_contain) from (check_sync+0x40/0x4f8) Fix it by chopping starting and ending parenthesis from the each symbol name. As a side note, this probably comes from the function dump_backtrace_entry(), which is implemented differently for each architecture. That makes a single decoding script a bit a challenge. Signed-off-by: Robert Jarzmik Cc: Sasha Levin Cc: Russell King Cc: Michal Marek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/decode_stacktrace.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index 515c4c00e957..00d6d53c2681 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -14,11 +14,14 @@ declare -A cache parse_symbol() { # The structure of symbol at this point is: - # [name]+[offset]/[total length] + # ([name]+[offset]/[total length]) # # For example: # do_basic_setup+0x9c/0xbf + # Remove the englobing parenthesis + symbol=${symbol#\(} + symbol=${symbol%\)} # Strip the symbol name so that we could look it up local name=${symbol%+*} -- cgit v1.2.3 From 35108d71383c5f4abc286430c0b5da79d22553f9 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Fri, 4 Sep 2015 15:43:29 -0700 Subject: scripts/spelling.txt: add some typo-words I wrote a small script to show word-pair from all linux spelling-typo commits, and get following result by sort | uniq -c: 181 occured -> occurred 78 transfered -> transferred 67 recieved -> received 65 dependant -> dependent 58 wether -> whether 56 accomodate -> accommodate 54 occured -> occurred 51 recieve -> receive 47 cant -> can't 40 sucessfully -> successfully ... Some of them are not in spelling.txt, this patch adds the most common word-pairs into spelling.txt. 
Signed-off-by: Zhao Lei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index bf30d2c0ec27..946caf3bd694 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -32,6 +32,7 @@ accoring||according accout||account accquire||acquire accquired||acquired +accross||across acessable||accessible acess||access achitecture||architecture @@ -100,8 +101,10 @@ appropiate||appropriate appropriatly||appropriately approriate||appropriate approriately||appropriately +apropriate||appropriate aquainted||acquainted aquired||acquired +aquisition||acquisition arbitary||arbitrary architechture||architecture arguement||argument @@ -111,6 +114,8 @@ arne't||aren't arraival||arrival artifical||artificial artillary||artillery +asign||assign +assertation||assertion assiged||assigned assigment||assignment assigments||assignments @@ -136,6 +141,7 @@ automatize||automate automatized||automated automatizes||automates autonymous||autonomous +auxillary||auxiliary auxilliary||auxiliary avaiable||available avaible||available @@ -234,6 +240,8 @@ compatability||compatibility compatable||compatible compatibiliy||compatibility compatibilty||compatibility +compatiblity||compatibility +competion||completion compilant||compliant compleatly||completely completly||completely @@ -294,6 +302,7 @@ defferred||deferred definate||definite definately||definitely defintion||definition +defintions||definitions defualt||default defult||default deivce||device @@ -309,6 +318,7 @@ depreacted||deprecated depreacte||deprecate desactivate||deactivate desciptors||descriptors +descripton||description descrition||description descritptor||descriptor desctiptor||descriptor @@ -330,6 +340,7 @@ devided||divided deviece||device diable||disable dictionnary||dictionary +didnt||didn't diferent||different differrence||difference difinition||definition @@ -347,6 +358,7 @@ docuentation||documentation documantation||documentation documentaion||documentation documment||document +doesnt||doesn't dorp||drop dosen||doesn downlad||download @@ -453,11 +465,13 @@ grahical||graphical grahpical||graphical grapic||graphic guage||gauge +guarenteed||guaranteed guarentee||guarantee halfs||halves hander||handler handfull||handful hanled||handled +happend||happened harware||hardware heirarchically||hierarchically helpfull||helpful @@ -515,6 +529,7 @@ initialzed||initialized initilization||initialization initilize||initialize inofficial||unofficial +insititute||institute instal||install inteface||interface integreated||integrated @@ -549,6 +564,7 @@ invididual||individual invokation||invocation invokations||invocations irrelevent||irrelevant +isnt||isn't isssue||issue itslef||itself jave||java @@ -561,6 +577,7 @@ langauage||language langauge||language langugage||language lauch||launch +layed||laid leightweight||lightweight lengh||length lenght||length @@ -717,6 +734,7 @@ preceeding||preceding preceed||precede precendence||precedence precission||precision +preemptable||preemptible prefered||preferred prefferably||preferably premption||preemption @@ -747,6 +765,7 @@ programers||programmers programm||program programms||programs progresss||progress +promiscous||promiscuous promps||prompts pronnounced||pronounced prononciation||pronunciation @@ -820,6 +839,7 @@ reseting||resetting resizeable||resizable resouces||resources resoures||resources +responce||response ressizes||resizes ressource||resource ressources||resources @@ 
-872,6 +892,7 @@ setts||sets settting||setting shotdown||shutdown shoud||should +shouldnt||shouldn't shoule||should shrinked||shrunk siginificantly||significantly @@ -916,9 +937,11 @@ straming||streaming struc||struct structres||structures stuct||struct +stucture||structure sturcture||structure subdirectoires||subdirectories suble||subtle +substract||subtract succesfully||successfully succesful||successful successfull||successful @@ -1031,6 +1054,7 @@ virtiual||virtual visiters||visitors vitual||virtual wating||waiting +wether||whether whataver||whatever whcih||which whenver||whenever -- cgit v1.2.3 From 917520e100e1db5e8dd546dd94fef070a31652a5 Mon Sep 17 00:00:00 2001 From: SF Markus Elfring Date: Fri, 4 Sep 2015 15:43:32 -0700 Subject: ntfs: delete unnecessary checks before calling iput() iput() tests whether its argument is NULL and then returns immediately. Thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Cc: Julia Lawall Reviewed-by: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/super.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index c1128bcbeb5e..d1a853585b53 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2204,17 +2204,12 @@ get_ctx_vol_failed: return true; #ifdef NTFS_RW iput_usnjrnl_err_out: - if (vol->usnjrnl_j_ino) - iput(vol->usnjrnl_j_ino); - if (vol->usnjrnl_max_ino) - iput(vol->usnjrnl_max_ino); - if (vol->usnjrnl_ino) - iput(vol->usnjrnl_ino); + iput(vol->usnjrnl_j_ino); + iput(vol->usnjrnl_max_ino); + iput(vol->usnjrnl_ino); iput_quota_err_out: - if (vol->quota_q_ino) - iput(vol->quota_q_ino); - if (vol->quota_ino) - iput(vol->quota_ino); + iput(vol->quota_q_ino); + iput(vol->quota_ino); iput(vol->extend_ino); #endif /* NTFS_RW */ iput_sec_err_out: @@ -2223,8 +2218,7 @@ iput_root_err_out: iput(vol->root_ino); iput_logfile_err_out: #ifdef NTFS_RW - if (vol->logfile_ino) - iput(vol->logfile_ino); + iput(vol->logfile_ino); iput_vol_err_out: #endif /* NTFS_RW */ iput(vol->vol_ino); @@ -2254,8 +2248,7 @@ iput_mftbmp_err_out: iput(vol->mftbmp_ino); iput_mirr_err_out: #ifdef NTFS_RW - if (vol->mftmirr_ino) - iput(vol->mftmirr_ino); + iput(vol->mftmirr_ino); #endif /* NTFS_RW */ return false; } -- cgit v1.2.3 From 81cf09edc793688cbf53c3082802571e2018f3ac Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Fri, 4 Sep 2015 15:43:35 -0700 Subject: sh: use PFN_DOWN macro Replace ((x) >> PAGE_SHIFT) with the predefined PFN_DOWN macro. 
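For reference, the macro and its rounding-up counterpart are defined in include/linux/pfn.h essentially as:

	#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
	#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

so PFN_DOWN(start) is exactly (start >> PAGE_SHIFT): it converts a physical address to the number of the page frame containing it, rounding down.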
Signed-off-by: Alexander Kuleshov Acked-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/mm/init.c | 4 ++-- arch/sh/mm/numa.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 2790b6a64157..17f486233db0 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -488,7 +488,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int arch_add_memory(int nid, u64 start, u64 size) { pg_data_t *pgdat; - unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; @@ -517,7 +517,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #ifdef CONFIG_MEMORY_HOTREMOVE int arch_remove_memory(u64 start, u64 size) { - unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; int ret; diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c index bce52ba66206..05713d190247 100644 --- a/arch/sh/mm/numa.c +++ b/arch/sh/mm/numa.c @@ -33,8 +33,8 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end) /* Don't allow bogus node assignment */ BUG_ON(nid >= MAX_NUMNODES || nid <= 0); - start_pfn = start >> PAGE_SHIFT; - end_pfn = end >> PAGE_SHIFT; + start_pfn = PFN_DOWN(start); + end_pfn = PFN_DOWN(end); pmb_bolt_mapping((unsigned long)__va(start), start, end - start, PAGE_KERNEL); -- cgit v1.2.3 From 512f62acbdf1ee81ce4882c85835f5420a1c304c Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 4 Sep 2015 15:43:37 -0700 Subject: ocfs2: fix race between dio and recover orphan During direct io the inode will be added to orphan first and then deleted from orphan. There is a race window that the orphan entry will be deleted twice and thus trigger the BUG when validating OCFS2_DIO_ORPHANED_FL in ocfs2_del_inode_from_orphan. ocfs2_direct_IO_write ... ocfs2_add_inode_to_orphan >>>>>>>> race window. 1) another node may rm the file and then down, this node take care of orphan recovery and clear flag OCFS2_DIO_ORPHANED_FL. 2) since rw lock is unlocked, it may race with another orphan recovery and append dio. ocfs2_del_inode_from_orphan So take inode mutex lock when recovering orphans and make rw unlock at the end of aio write in case of append dio. 
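In outline, the recovery loop now brackets the rw lock with i_mutex (a condensed sketch of the journal.c hunk below; error paths omitted):

	/* ocfs2_recover_orphans(), per-inode iteration (sketch) */
	mutex_lock(&inode->i_mutex);
	ret = ocfs2_rw_lock(inode, 1);
	/* ... truncate and/or delete the inode from the orphan dir ... */
	ocfs2_rw_unlock(inode, 1);
	mutex_unlock(&inode->i_mutex);
	iput(inode);

Holding i_mutex across the whole per-inode step is what closes the window against a concurrent append dio on the same inode.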
Signed-off-by: Joseph Qi Reported-by: Yiwen Jiang Cc: Weiwei Wang Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 9 ++++++--- fs/ocfs2/file.c | 2 +- fs/ocfs2/inode.h | 2 -- fs/ocfs2/journal.c | 8 ++++---- fs/ocfs2/namei.c | 42 +++++++++++++----------------------------- fs/ocfs2/super.c | 2 -- 6 files changed, 24 insertions(+), 41 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0f5fd9db8194..1e88ff483702 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -627,10 +627,13 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } - ocfs2_iocb_clear_rw_locked(iocb); + /* Let rw unlock to be done later to protect append direct io write */ + if (offset + bytes <= i_size_read(inode)) { + ocfs2_iocb_clear_rw_locked(iocb); - level = ocfs2_iocb_rw_locked_level(iocb); - ocfs2_rw_unlock(inode, level); + level = ocfs2_iocb_rw_locked_level(iocb); + ocfs2_rw_unlock(inode, level); + } } static int ocfs2_releasepage(struct page *page, gfp_t wait) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 2eb11363b1f7..5d384a6cd696 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2416,7 +2416,7 @@ relock: } no_sync: - if (unaligned_dio) { + if (unaligned_dio && ocfs2_iocb_is_unaligned_aio(iocb)) { ocfs2_iocb_clear_unaligned_aio(iocb); mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio); } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 5e86b247c821..ca3431ee7f24 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -81,8 +81,6 @@ struct ocfs2_inode_info tid_t i_sync_tid; tid_t i_datasync_tid; - wait_queue_head_t append_dio_wq; - struct dquot *i_dquot[MAXQUOTAS]; }; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 7c099f7032fd..5e5626884433 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2170,6 +2170,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; oi->ip_next_orphan = NULL; + mutex_lock(&inode->i_mutex); ret = ocfs2_rw_lock(inode, 1); if (ret < 0) { mlog_errno(ret); @@ -2206,17 +2207,16 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); if (ret) mlog_errno(ret); - - wake_up(&OCFS2_I(inode)->append_dio_wq); } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ unlock_inode: ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + di_bh = NULL; unlock_rw: ocfs2_rw_unlock(inode, 1); next: + mutex_unlock(&inode->i_mutex); iput(inode); - brelse(di_bh); - di_bh = NULL; inode = iter; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 948681e37cfd..e9ea7f23da12 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2601,27 +2601,6 @@ leave: return status; } -static int ocfs2_dio_orphan_recovered(struct inode *inode) -{ - int ret; - struct buffer_head *di_bh = NULL; - struct ocfs2_dinode *di = NULL; - - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { - mlog_errno(ret); - return 0; - } - - di = (struct ocfs2_dinode *) di_bh->b_data; - ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)); - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - - return ret; -} - -#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000 int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, struct inode *inode) { @@ -2633,7 +2612,6 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, handle_t *handle = NULL; struct ocfs2_dinode *di = NULL; -restart: status = ocfs2_inode_lock(inode, &di_bh, 1); if (status < 0) { mlog_errno(status); @@ -2643,15 +2621,21 @@ restart: 
di = (struct ocfs2_dinode *) di_bh->b_data; /* * Another append dio crashed? - * If so, wait for recovery first. + * If so, manually recover it first. */ if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq, - ocfs2_dio_orphan_recovered(inode), - msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL)); - goto restart; + status = ocfs2_truncate_file(inode, di_bh, i_size_read(inode)); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail_unlock_inode; + } + + status = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_inode; + } } status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 403c5660b306..4474ef2bbc96 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1746,8 +1746,6 @@ static void ocfs2_inode_init_once(void *data) ocfs2_lock_res_init_once(&oi->ip_inode_lockres); ocfs2_lock_res_init_once(&oi->ip_open_lockres); - init_waitqueue_head(&oi->append_dio_wq); - ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), &ocfs2_inode_caching_ops); -- cgit v1.2.3 From faaebf18f831c1546bdc65ff8f49d2a73e675ded Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 4 Sep 2015 15:43:40 -0700 Subject: ocfs2: fix several issues of append dio 1) Take rw EX lock in case of append dio. 2) Explicitly treat the error code -EIOCBQUEUED as normal. 3) Set di_bh to NULL after brelse if it may be used again later. Signed-off-by: Joseph Qi Cc: Yiwen Jiang Cc: Weiwei Wang Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 7 ++++++- fs/ocfs2/file.c | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 1e88ff483702..b36dcad3a140 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -860,7 +860,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, offset, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io, NULL, 0); - if (unlikely(written < 0)) { + /* overwrite aio may return -EIOCBQUEUED, and it is not an error */ + if ((written < 0) && (written != -EIOCBQUEUED)) { loff_t i_size = i_size_read(inode); if (offset + count > i_size) { @@ -879,12 +880,14 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, ocfs2_inode_unlock(inode, 1); brelse(di_bh); + di_bh = NULL; goto clean_orphan; } } ocfs2_inode_unlock(inode, 1); brelse(di_bh); + di_bh = NULL; ret = jbd2_journal_force_commit(journal); if (ret < 0) @@ -939,10 +942,12 @@ clean_orphan: if (tmp_ret < 0) { ret = tmp_ret; mlog_errno(ret); + brelse(di_bh); goto out; } ocfs2_inode_unlock(inode, 1); + brelse(di_bh); tmp_ret = jbd2_journal_force_commit(journal); if (tmp_ret < 0) { diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5d384a6cd696..38fc33922832 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2271,6 +2271,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, OCFS2_MOUNT_COHERENCY_BUFFERED); int unaligned_dio = 0; int dropped_dio = 0; + int append_write = ((iocb->ki_pos + count) >= + i_size_read(inode) ? 1 : 0); trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2290,8 +2292,9 @@ relock: /* * Concurrent O_DIRECT writes are allowed with * mount_option "coherency=buffered". + * For append write, we must take rw EX. 
*/ - rw_level = (!direct_io || full_coherency); + rw_level = (!direct_io || full_coherency || append_write); ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { -- cgit v1.2.3 From acf8fdbe6afb084666df347602fe4258f1cf5fd5 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 4 Sep 2015 15:43:43 -0700 Subject: ocfs2: do not BUG if buffer not uptodate in __ocfs2_journal_access When storage network is unstable, it may trigger the BUG in __ocfs2_journal_access because of buffer not uptodate. We can retry the write in this case or return error instead of BUG. Signed-off-by: Joseph Qi Reported-by: Zhangguanghui Tested-by: Zhangguanghui Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/journal.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 5e5626884433..3bfd36a23e40 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -668,7 +668,23 @@ static int __ocfs2_journal_access(handle_t *handle, mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); mlog(ML_ERROR, "b_blocknr=%llu\n", (unsigned long long)bh->b_blocknr); - BUG(); + + lock_buffer(bh); + /* + * A previous attempt to write this buffer head failed. + * Nothing we can do but to retry the write and hope for + * the best. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) { + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + + if (!buffer_uptodate(bh)) { + unlock_buffer(bh); + return -EIO; + } + unlock_buffer(bh); } /* Set the current transaction information on the ci so -- cgit v1.2.3 From 372a447c4bb8271d128def5f93e3365d5d06b4d8 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 4 Sep 2015 15:43:46 -0700 Subject: ocfs2: do not log twice error messages 'o2hb_map_slot_data' and 'o2hb_populate_slot_data' are called from only one place, in 'o2hb_region_dev_write'. Return value is checked and 'mlog_errno' is called to log a message if it is not 0. So there is no need to call 'mlog_errno' directly within these functions. This would result on logging the message twice. 
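In other words, the change removes the "log at every level" pattern; each failure is now reported exactly once, at the call site (a condensed sketch of the surviving log site, based on the heartbeat.c hunk below):

	/* in o2hb_region_dev_write(), the sole caller */
	ret = o2hb_map_slot_data(reg);	/* now fails silently with -ENOMEM */
	if (ret) {
		mlog_errno(ret);	/* the single remaining log site */
		goto out;
	}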
Signed-off-by: Christophe JAILLET Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 140de3c93d2e..f97306453a0b 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1619,17 +1619,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg) struct o2hb_disk_slot *slot; reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); - if (reg->hr_tmp_block == NULL) { - mlog_errno(-ENOMEM); + if (reg->hr_tmp_block == NULL) return -ENOMEM; - } reg->hr_slots = kcalloc(reg->hr_blocks, sizeof(struct o2hb_disk_slot), GFP_KERNEL); - if (reg->hr_slots == NULL) { - mlog_errno(-ENOMEM); + if (reg->hr_slots == NULL) return -ENOMEM; - } for(i = 0; i < reg->hr_blocks; i++) { slot = ®->hr_slots[i]; @@ -1645,17 +1641,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg) reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), GFP_KERNEL); - if (!reg->hr_slot_data) { - mlog_errno(-ENOMEM); + if (!reg->hr_slot_data) return -ENOMEM; - } for(i = 0; i < reg->hr_num_pages; i++) { page = alloc_page(GFP_KERNEL); - if (!page) { - mlog_errno(-ENOMEM); + if (!page) return -ENOMEM; - } reg->hr_slot_data[i] = page; @@ -1687,10 +1679,8 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg) struct o2hb_disk_heartbeat_block *hb_block; ret = o2hb_read_slots(reg, reg->hr_blocks); - if (ret) { - mlog_errno(ret); + if (ret) goto out; - } /* We only want to get an idea of the values initially in each * slot, so we do no verification - o2hb_check_slot will -- cgit v1.2.3 From bf59e6623a3a92a2bf428f2d6592c81aae6317e1 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 4 Sep 2015 15:43:49 -0700 Subject: ocfs2: clean up unused local variables in ocfs2_file_write_iter Since commit 86b9c6f3f891 ("ocfs2: remove filesize checks for sync I/O journal commit") removes filesize checks for sync I/O journal commit, variables old_size and old_clusters are not actually used any more. So clean them up. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 38fc33922832..c4a99fb61c3e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2262,8 +2262,6 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, ssize_t written = 0; ssize_t ret; size_t count = iov_iter_count(from), orig_count; - loff_t old_size; - u32 old_clusters; struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2367,13 +2365,6 @@ relock: ocfs2_iocb_set_unaligned_aio(iocb); } - /* - * To later detect whether a journal commit for sync writes is - * necessary, we sample i_size, and cluster count here. - */ - old_size = i_size_read(inode); - old_clusters = OCFS2_I(inode)->ip_clusters; - /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb, rw_level); -- cgit v1.2.3 From 3cb2ec43f63c42412a18620f1226eb4aa434a7a8 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 4 Sep 2015 15:43:52 -0700 Subject: ocfs2: adjust code to match locking/unlocking order Unlocking order in ocfs2_unlink and ocfs2_rename mismatches the corresponding locking order, although it won't cause issues, adjust the code so that it looks more reasonable. 
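The convention being restored is that locks are dropped in the reverse (LIFO) order of their acquisition. For ocfs2_unlink, the ordering after the patch (see the hunks below) becomes:

	lock:   dir -> inode (child) -> orphan_dir
	unlock: orphan_dir -> inode (child) -> dir

and ocfs2_rename gets the analogous treatment for its larger lock set, ending with rename_lock released last.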
Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index e9ea7f23da12..97c47d71efa7 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1035,11 +1035,6 @@ leave: if (handle) ocfs2_commit_trans(osb, handle); - if (child_locked) - ocfs2_inode_unlock(inode, 1); - - ocfs2_inode_unlock(dir, 1); - if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); @@ -1047,6 +1042,11 @@ leave: iput(orphan_dir); } + if (child_locked) + ocfs2_inode_unlock(inode, 1); + + ocfs2_inode_unlock(dir, 1); + brelse(fe_bh); brelse(parent_node_bh); @@ -1633,21 +1633,9 @@ static int ocfs2_rename(struct inode *old_dir, ocfs2_dentry_move(old_dentry, new_dentry, old_dir, new_dir); status = 0; bail: - if (rename_lock) - ocfs2_rename_unlock(osb); - if (handle) ocfs2_commit_trans(osb, handle); - if (parents_locked) - ocfs2_double_unlock(old_dir, new_dir); - - if (old_child_locked) - ocfs2_inode_unlock(old_inode, 1); - - if (new_child_locked) - ocfs2_inode_unlock(new_inode, 1); - if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); @@ -1655,6 +1643,18 @@ bail: iput(orphan_dir); } + if (new_child_locked) + ocfs2_inode_unlock(new_inode, 1); + + if (old_child_locked) + ocfs2_inode_unlock(old_inode, 1); + + if (parents_locked) + ocfs2_double_unlock(old_dir, new_dir); + + if (rename_lock) + ocfs2_rename_unlock(osb); + if (new_inode) sync_mapping_buffers(old_inode->i_mapping); -- cgit v1.2.3 From 914a9b74295774b92409fbc3e0abcfa9185d9469 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 4 Sep 2015 15:43:54 -0700 Subject: ocfs2: remove unneeded code in ocfs2_dlm_init status is already initialized and it will only be 0 or negative values in the code flow. So remove the unneeded assignment after the label 'local'. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 23157e40dd74..1c91103c1333 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3035,8 +3035,6 @@ local: ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb); osb->cconn = conn; - - status = 0; bail: if (status < 0) { ocfs2_dlm_shutdown_debug(osb); -- cgit v1.2.3 From cdd09f49cb271d95cbe69ef886459e0490040e98 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 4 Sep 2015 15:43:57 -0700 Subject: ocfs2: fix BUG when o2hb_register_callback fails In dlm_register_domain_handlers, if o2hb_register_callback fails, it will call dlm_unregister_domain_handlers to unregister. This will trigger the BUG_ON in o2hb_unregister_callback because hc_magic is 0. So we should call o2hb_setup_callback to initialize hc first.
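The resulting registration sequence (condensed from the dlmdomain.c hunk below) initializes both heartbeat callbacks up front, so a failure at either register step leaves only fully set-up callbacks for the unregister path to walk:

	o2hb_setup_callback(&dlm->dlm_hb_down, ...);	/* sets hc_magic */
	o2hb_setup_callback(&dlm->dlm_hb_up, ...);
	status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down);
	if (status)
		goto bail;	/* unregister now sees valid hc_magic */
	status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up);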
Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdomain.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 7df88a6dd626..4f750701bd9a 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1725,12 +1725,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB, dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI); + o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, + dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); + status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_down); if (status) goto bail; - o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB, - dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI); status = o2hb_register_callback(dlm->name, &dlm->dlm_hb_up); if (status) goto bail; -- cgit v1.2.3 From 0e3d9eafb86183a33efc42f0beff5afceebbafba Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 4 Sep 2015 15:44:00 -0700 Subject: ocfs2: remove unneeded code in dlm_register_domain_handlers The last goto statement is unneeded, so remove it. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdomain.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 4f750701bd9a..019459b20aeb 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1846,8 +1846,6 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm) sizeof(struct dlm_exit_domain), dlm_begin_exit_domain_handler, dlm, NULL, &dlm->dlm_domain_handlers); - if (status) - goto bail; bail: if (status) -- cgit v1.2.3 From f83c7b5e9fd633fe91128af116e6472a8c4d29a5 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 4 Sep 2015 15:44:03 -0700 Subject: ocfs2/dlm: use list_for_each_entry instead of list_for_each Use list_for_each_entry instead of list_for_each to simplify code. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index ce12e0b1a31f..d0e436dc6437 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1776,7 +1776,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_migratable_lockres *mres) { struct dlm_migratable_lock *ml; - struct list_head *queue, *iter; + struct list_head *queue; struct list_head *tmpq = NULL; struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; @@ -1821,9 +1821,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { tmpq = dlm_list_idx_to_ptr(res, j); - list_for_each(iter, tmpq) { - lock = list_entry(iter, - struct dlm_lock, list); + list_for_each_entry(lock, tmpq, list) { if (lock->ml.cookie == ml->cookie) break; lock = NULL; -- cgit v1.2.3 From 807a7907114c7c703017ed7a96477a2eeb0d08e0 Mon Sep 17 00:00:00 2001 From: jiangyiwen Date: Fri, 4 Sep 2015 15:44:06 -0700 Subject: ocfs2: set filesystem read-only when ocfs2_delete_entry failed. In ocfs2_rename, it will lead to an inode with two entries (old and new) if ocfs2_delete_entry(old) failed. Thus, the filesystem will be inconsistent.
The case is described below: ocfs2_rename -> ocfs2_start_trans -> ocfs2_add_entry(new) -> ocfs2_delete_entry(old) -> __ocfs2_journal_access *failed* because of -ENOMEM -> ocfs2_commit_trans So the filesystem should be set read-only at that point. Signed-off-by: Yiwen Jiang Cc: Joseph Qi Cc: Joel Becker Reviewed-by: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 97c47d71efa7..1c43993e81b0 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1569,12 +1569,25 @@ static int ocfs2_rename(struct inode *old_dir, status = ocfs2_find_entry(old_dentry->d_name.name, old_dentry->d_name.len, old_dir, &old_entry_lookup); - if (status) + if (status) { + if (!is_journal_aborted(osb->journal->j_journal)) { + ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " "is not deleted.", + new_dentry->d_name.len, new_dentry->d_name.name, + old_dentry->d_name.len, old_dentry->d_name.name); + } goto bail; + } status = ocfs2_delete_entry(handle, old_dir, &old_entry_lookup); if (status < 0) { mlog_errno(status); + if (!is_journal_aborted(osb->journal->j_journal)) { + ocfs2_error(osb->sb, "new entry %.*s is added, but old entry %.*s " "is not deleted.", + new_dentry->d_name.len, new_dentry->d_name.name, + old_dentry->d_name.len, old_dentry->d_name.name); + } goto bail; } -- cgit v1.2.3 From 0f5e7b41f91814447defc34e915fc5d6e52266d9 Mon Sep 17 00:00:00 2001 From: Sanidhya Kashyap Date: Fri, 4 Sep 2015 15:44:08 -0700 Subject: ocfs2: trusted xattr missing CAP_SYS_ADMIN check The trusted extended attributes are only visible to processes which have the CAP_SYS_ADMIN capability, but the check is missing in the ocfs2 trusted xattr list handler. The check is important because this will be used for implementing mechanisms in userspace to which ordinary processes should not have access. Signed-off-by: Sanidhya Kashyap Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Taesoo kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/xattr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 889f3796a0d7..a24f264b2fc4 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7334,6 +7334,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list, const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; const size_t total_len = prefix_len + name_len + 1; + if (!capable(CAP_SYS_ADMIN)) + return 0; + if (list && total_len <= list_size) { memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); memcpy(list + prefix_len, name, name_len); -- cgit v1.2.3 From 513e2dae9422223072ed3887e91efebec2fc0a01 Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Fri, 4 Sep 2015 15:44:11 -0700 Subject: ocfs2: flush inode data to disk and free inode when i_count becomes zero Disk inode deletion may be heavily delayed when one node unlinks a file after the same dentry has been freed on another node (say N1) by memory shrinking, while the inode is left in memory. This inode can only be freed while N1 is doing the orphan scan work. However, N1 may skip the orphan scan several times because other nodes may do the work earlier. In our tests, it could take 1 hour on a 4-node cluster, and it hurts the user experience. So we think the inode should be freed after the data is flushed to disk when i_count becomes zero, to avoid such circumstances.
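The mechanism (visible in the inode.c hunk below) is the stock VFS pattern for flushing an inode from ->drop_inode: mark it I_WILL_FREE so writeback won't race with it, write it out synchronously, then clear the flag and return 1 so the VFS evicts it immediately (condensed sketch):

	/* ocfs2_drop_inode(), called with inode->i_lock held */
	inode->i_state |= I_WILL_FREE;
	spin_unlock(&inode->i_lock);
	write_inode_now(inode, 1);	/* flush data and inode, wait for completion */
	spin_lock(&inode->i_lock);
	inode->i_state &= ~I_WILL_FREE;
	return 1;			/* evict now instead of caching the inode */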
Signed-off-by: Joyce.xue Cc: Joel Becker Reviewed-by: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index b254416dc8d9..4e69f3cbc5f1 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1191,17 +1191,19 @@ void ocfs2_evict_inode(struct inode *inode) int ocfs2_drop_inode(struct inode *inode) { struct ocfs2_inode_info *oi = OCFS2_I(inode); - int res; trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); - if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) - res = 1; - else - res = generic_drop_inode(inode); + assert_spin_locked(&inode->i_lock); + inode->i_state |= I_WILL_FREE; + spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + spin_lock(&inode->i_lock); + WARN_ON(inode->i_state & I_NEW); + inode->i_state &= ~I_WILL_FREE; - return res; + return 1; } /* -- cgit v1.2.3 From 7d0fb9148ab6f52006de7cce18860227594ba872 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 4 Sep 2015 15:44:11 -0700 Subject: ocfs2: add errors=continue OCFS2 is often used in high-availability systems. However, ocfs2 converts the filesystem to read-only at the drop of a hat. This may not be necessary, since turning the filesystem read-only would affect other running processes as well, decreasing availability. This attempt is to add errors=continue, which would return EIO to the calling process and terminate further processing so that the filesystem is not corrupted further. However, the filesystem is not converted to read-only. As a future plan, I intend to create a small utility or extend fsck.ocfs2 to fix small errors such as in the inode. The input to the utility such as the inode can come from the kernel logs so we don't have to schedule a downtime for fixing small-enough errors. The patch changes ocfs2_error() to return an error. The error returned depends on the mount option set. If none is set, the default is to turn the filesystem read-only. Perhaps errors=continue is not the best option name. Historically it is used for making an attempt to progress in the current process itself. Should we call it errors=eio or errors=killproc? Suggestions/Comments welcome.
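A minimal user-space model of the three mutually exclusive error modes described above (parse_errors_opt and handle_error are illustrative stand-ins; the real parsing lives in ocfs2_parse_options in the diff below):

#include <stdio.h>
#include <string.h>

#define ERRORS_PANIC (1u << 0)
#define ERRORS_ROFS  (1u << 1)
#define ERRORS_CONT  (1u << 2)

static unsigned int parse_errors_opt(const char *val)
{
	/* Each option clears the other two, mirroring the patch hunks. */
	if (!strcmp(val, "panic"))
		return ERRORS_PANIC;
	if (!strcmp(val, "continue"))
		return ERRORS_CONT;
	return ERRORS_ROFS;	/* "remount-ro" stays the default */
}

static int handle_error(unsigned int opt)
{
	if (opt & ERRORS_PANIC) {
		printf("panic: forced after error\n");
		return 0;
	}
	if (opt & ERRORS_CONT) {
		printf("EIO returned to the calling process only\n");
		return -5;	/* -EIO */
	}
	printf("filesystem goes read-only\n");
	return -30;		/* -EROFS */
}

int main(void)
{
	printf("-> %d\n", handle_error(parse_errors_opt("continue")));
	printf("-> %d\n", handle_error(parse_errors_opt("remount-ro")));
	return 0;
}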
Sources are available at: https://github.com/goldwynr/linux/tree/error-cont Signed-off-by: Goldwyn Rodrigues Signed-off-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ocfs2.h | 2 ++ fs/ocfs2/super.c | 63 +++++++++++++++++++++++++++++++++++++++----------------- fs/ocfs2/super.h | 2 +- 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 690ddc60189b..7a0126267847 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -286,6 +286,8 @@ enum ocfs2_mount_options OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */ + OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */ + OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */ }; #define OCFS2_OSB_SOFT_RO 0x0001 diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 4474ef2bbc96..e79058ecfb4b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -192,6 +192,7 @@ enum { Opt_resv_level, Opt_dir_resv_level, Opt_journal_async_commit, + Opt_err_cont, Opt_err, }; @@ -224,6 +225,7 @@ static const match_table_t tokens = { {Opt_resv_level, "resv_level=%u"}, {Opt_dir_resv_level, "dir_resv_level=%u"}, {Opt_journal_async_commit, "journal_async_commit"}, + {Opt_err_cont, "errors=continue"}, {Opt_err, NULL} }; @@ -1330,10 +1332,19 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt |= OCFS2_MOUNT_NOINTR; break; case Opt_err_panic: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; mopt->mount_opt |= OCFS2_MOUNT_ERRORS_PANIC; break; case Opt_err_ro: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_CONT; mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + mopt->mount_opt |= OCFS2_MOUNT_ERRORS_ROFS; + break; + case Opt_err_cont: + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_ROFS; + mopt->mount_opt &= ~OCFS2_MOUNT_ERRORS_PANIC; + mopt->mount_opt |= OCFS2_MOUNT_ERRORS_CONT; break; case Opt_data_ordered: mopt->mount_opt &= ~OCFS2_MOUNT_DATA_WRITEBACK; @@ -1530,6 +1541,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) if (opts & OCFS2_MOUNT_ERRORS_PANIC) seq_printf(s, ",errors=panic"); + else if (opts & OCFS2_MOUNT_ERRORS_CONT) + seq_printf(s, ",errors=continue"); else seq_printf(s, ",errors=remount-ro"); @@ -2539,31 +2552,43 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) memset(osb, 0, sizeof(struct ocfs2_super)); } -/* Put OCFS2 into a readonly state, or (if the user specifies it), - * panic(). We do not support continue-on-error operation. */ -static void ocfs2_handle_error(struct super_block *sb) +/* Depending on the mount option passed, perform one of the following: + * Put OCFS2 into a readonly state (default) + * Return EIO so that only the process errs + * Fix the error as if fsck.ocfs2 -y + * panic + */ +static int ocfs2_handle_error(struct super_block *sb) { struct ocfs2_super *osb = OCFS2_SB(sb); - - if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) - panic("OCFS2: (device %s): panic forced after error\n", - sb->s_id); + int rv = 0; ocfs2_set_osb_flag(osb, OCFS2_OSB_ERROR_FS); + pr_crit("On-disk corruption discovered. " + "Please run fsck.ocfs2 once the filesystem is unmounted.\n"); - if (sb->s_flags & MS_RDONLY && - (ocfs2_is_soft_readonly(osb) || - ocfs2_is_hard_readonly(osb))) - return; - - printk(KERN_CRIT "File system is now read-only due to the potential " - "of on-disk corruption. 
Please run fsck.ocfs2 once the file " - "system is unmounted.\n"); - sb->s_flags |= MS_RDONLY; - ocfs2_set_ro_flag(osb, 0); + if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_PANIC) { + panic("OCFS2: (device %s): panic forced after error\n", + sb->s_id); + } else if (osb->s_mount_opt & OCFS2_MOUNT_ERRORS_CONT) { + pr_crit("OCFS2: Returning error to the calling process.\n"); + rv = -EIO; + } else { /* default option */ + rv = -EROFS; + if (sb->s_flags & MS_RDONLY && + (ocfs2_is_soft_readonly(osb) || + ocfs2_is_hard_readonly(osb))) + return rv; + + pr_crit("OCFS2: File system is now read-only.\n"); + sb->s_flags |= MS_RDONLY; + ocfs2_set_ro_flag(osb, 0); + } + + return rv; } -void __ocfs2_error(struct super_block *sb, const char *function, +int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...) { struct va_format vaf; @@ -2580,7 +2605,7 @@ void __ocfs2_error(struct super_block *sb, const char *function, va_end(args); - ocfs2_handle_error(sb); + return ocfs2_handle_error(sb); } /* Handle critical errors. This is intentionally more drastic than diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index 74ff74cf78fe..c1c87d90542c 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -32,7 +32,7 @@ int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, int node_num); __printf(3, 4) -void __ocfs2_error(struct super_block *sb, const char *function, +int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...); #define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) -- cgit v1.2.3 From 17a5b9ab32fe0464e7f556e28a2b49d2023fb533 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 4 Sep 2015 15:44:17 -0700 Subject: ocfs2: acknowledge return value of ocfs2_error() Caveat: This may return -EROFS for a read case, which seems wrong. This is happening even without this patch series though. Should we convert EROFS to EIO? 
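A small sketch of the calling convention this series assumes (toy_ocfs2_error is a stand-in, not the real function): validators stop hard-coding -EINVAL/-EROFS and instead propagate whatever errno the errors= mode selected, which is also how the read-path caveat above can surface -EROFS.

#include <stdio.h>

/* Stand-in for the new ocfs2_error(): logs, then returns the errno
 * chosen by the mount option (-EIO under errors=continue, -EROFS
 * under the read-only default). */
static int toy_ocfs2_error(const char *msg)
{
	fprintf(stderr, "ocfs2 error: %s\n", msg);
	return -30;	/* pretend errors=remount-ro: -EROFS */
}

/* Old style: toy_ocfs2_error(...); return -EINVAL;
 * New style: propagate whatever toy_ocfs2_error() chose. */
static int validate_block(int signature_ok)
{
	int rc = 0;

	if (!signature_ok)
		rc = toy_ocfs2_error("bad signature");
	return rc;
}

int main(void)
{
	printf("validate -> %d\n", validate_block(0));
	return 0;
}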
Signed-off-by: Goldwyn Rodrigues Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 16 ++++++++-------- fs/ocfs2/dir.c | 25 +++++++++---------------- fs/ocfs2/inode.c | 8 ++++---- fs/ocfs2/move_extents.c | 3 +-- fs/ocfs2/refcounttree.c | 42 ++++++++++++++++++------------------------ fs/ocfs2/suballoc.c | 25 ++++++------------------- fs/ocfs2/xattr.c | 15 +++++---------- 7 files changed, 51 insertions(+), 83 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 5997c00a1515..9a0fd494fe74 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -908,32 +908,32 @@ static int ocfs2_validate_extent_block(struct super_block *sb, */ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - ocfs2_error(sb, + rc = ocfs2_error(sb, "Extent block #%llu has bad signature %.*s", (unsigned long long)bh->b_blocknr, 7, eb->h_signature); - return -EINVAL; + goto bail; } if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { - ocfs2_error(sb, + rc = ocfs2_error(sb, "Extent block #%llu has an invalid h_blkno " "of %llu", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(eb->h_blkno)); - return -EINVAL; + goto bail; } if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, + rc = ocfs2_error(sb, "Extent block #%llu has an invalid " "h_fs_generation of #%u", (unsigned long long)bh->b_blocknr, le32_to_cpu(eb->h_fs_generation)); - return -EINVAL; + goto bail; } - - return 0; +bail: + return rc; } int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 02878a83f0b4..25f03af09237 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -480,8 +480,7 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, + rc = ocfs2_error(dir->i_sb, "Invalid dirblock #%llu: " "signature = %.*s\n", (unsigned long long)bh->b_blocknr, 7, @@ -489,8 +488,7 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) goto out; } if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, + rc = ocfs2_error(dir->i_sb, "Directory block #%llu has an invalid " "db_blkno of %llu", (unsigned long long)bh->b_blocknr, @@ -499,8 +497,7 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) } if (le64_to_cpu(trailer->db_parent_dinode) != OCFS2_I(dir)->ip_blkno) { - rc = -EINVAL; - ocfs2_error(dir->i_sb, + rc = ocfs2_error(dir->i_sb, "Directory block #%llu on dinode " "#%llu has an invalid parent_dinode " "of %llu", @@ -604,14 +601,13 @@ static int ocfs2_validate_dx_root(struct super_block *sb, } if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { - ocfs2_error(sb, + ret = ocfs2_error(sb, "Dir Index Root # %llu has bad signature %.*s", (unsigned long long)le64_to_cpu(dx_root->dr_blkno), 7, dx_root->dr_signature); - return -EINVAL; } - return 0; + return ret; } static int ocfs2_read_dx_root(struct inode *dir, struct ocfs2_dinode *di, @@ -648,12 +644,11 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb, } if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { - ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", + ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", 7, dx_leaf->dl_signature); - return -EROFS; } - return 0; + return ret; } static int ocfs2_read_dx_leaf(struct inode *dir, u64 blkno, @@ -812,11 +807,10 @@ static int 
ocfs2_dx_dir_lookup_rec(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, + ret = ocfs2_error(inode->i_sb, "Inode %lu has non zero tree depth in " "btree tree block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; goto out; } } @@ -832,11 +826,10 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, } if (!found) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent " "record (%u, %u, 0) in btree", inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); - ret = -EROFS; goto out; } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 4e69f3cbc5f1..7868f7e7c455 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1352,21 +1352,21 @@ int ocfs2_validate_inode_block(struct super_block *sb, rc = -EINVAL; if (!OCFS2_IS_VALID_DINODE(di)) { - ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", + rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", (unsigned long long)bh->b_blocknr, 7, di->i_signature); goto bail; } if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { - ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", + rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(di->i_blkno)); goto bail; } if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { - ocfs2_error(sb, + rc = ocfs2_error(sb, "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", (unsigned long long)bh->b_blocknr); goto bail; @@ -1374,7 +1374,7 @@ int ocfs2_validate_inode_block(struct super_block *sb, if (le32_to_cpu(di->i_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, + rc = ocfs2_error(sb, "Invalid dinode #%llu: fs_generation is %u\n", (unsigned long long)bh->b_blocknr, le32_to_cpu(di->i_fs_generation)); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 56a768d06aa6..70dd0ec7b7e9 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -99,11 +99,10 @@ static int __ocfs2_move_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { - ocfs2_error(inode->i_sb, + ret = ocfs2_error(inode->i_sb, "Inode %llu has an extent at cpos %u which can no " "longer be found.\n", (unsigned long long)ino, cpos); - ret = -EROFS; goto out; } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 7dc818b87cd8..b404dbde3fe4 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -102,32 +102,32 @@ static int ocfs2_validate_refcount_block(struct super_block *sb, if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { - ocfs2_error(sb, + rc = ocfs2_error(sb, "Refcount block #%llu has bad signature %.*s", (unsigned long long)bh->b_blocknr, 7, rb->rf_signature); - return -EINVAL; + goto out; } if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { - ocfs2_error(sb, + rc = ocfs2_error(sb, "Refcount block #%llu has an invalid rf_blkno " "of %llu", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(rb->rf_blkno)); - return -EINVAL; + goto out; } if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, + rc = ocfs2_error(sb, "Refcount block #%llu has an invalid " "rf_fs_generation of #%u", (unsigned long long)bh->b_blocknr, le32_to_cpu(rb->rf_fs_generation)); - return -EINVAL; + goto out; } - - return 0; +out: + return rc; } static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, @@ -1102,12 +1102,11 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info 
*ci, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(sb, - "refcount tree %llu has non zero tree " - "depth in leaf btree tree block %llu\n", - (unsigned long long)ocfs2_metadata_cache_owner(ci), - (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; + ret = ocfs2_error(sb, + "refcount tree %llu has non zero tree " + "depth in leaf btree tree block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -2359,10 +2358,9 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode, cpos, len, phys); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " "tree, but the feature bit is not set in the " "super block.", inode->i_ino); - ret = -EROFS; goto out; } @@ -2545,10 +2543,9 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " "tree, but the feature bit is not set in the " "super block.", inode->i_ino); - ret = -EROFS; goto out; } @@ -2672,11 +2669,10 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, + ret = ocfs2_error(inode->i_sb, "Inode %lu has non zero tree depth in " "leaf block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; goto out; } } @@ -3106,11 +3102,10 @@ static int ocfs2_clear_ext_refcount(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { - ocfs2_error(sb, + ret = ocfs2_error(sb, "Inode %llu has an extent at cpos %u which can no " "longer be found.\n", (unsigned long long)ino, cpos); - ret = -EROFS; goto out; } @@ -3376,10 +3371,9 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " + return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " "tree, but the feature bit is not set in the " "super block.", inode->i_ino); - return -EROFS; } ocfs2_init_dealloc_ctxt(&context->dealloc); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 4479029630bb..e4bb00110e91 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -171,7 +171,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) if (resize) \ mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ else \ - ocfs2_error(sb, fmt, ##__VA_ARGS__); \ + return ocfs2_error(sb, fmt, ##__VA_ARGS__); \ } while (0) static int ocfs2_validate_gd_self(struct super_block *sb, @@ -184,7 +184,6 @@ static int ocfs2_validate_gd_self(struct super_block *sb, do_error("Group descriptor #%llu has bad signature %.*s", (unsigned long long)bh->b_blocknr, 7, gd->bg_signature); - return -EINVAL; } if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { @@ -192,7 +191,6 @@ static int ocfs2_validate_gd_self(struct super_block *sb, "of %llu", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_blkno)); - return -EINVAL; } if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { @@ -200,7 +198,6 @@ static int ocfs2_validate_gd_self(struct super_block *sb, "fs_generation of #%u", (unsigned long long)bh->b_blocknr, 
le32_to_cpu(gd->bg_generation)); - return -EINVAL; } if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { @@ -209,7 +206,6 @@ static int ocfs2_validate_gd_self(struct super_block *sb, (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), le16_to_cpu(gd->bg_free_bits_count)); - return -EINVAL; } if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { @@ -218,7 +214,6 @@ static int ocfs2_validate_gd_self(struct super_block *sb, (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), 8 * le16_to_cpu(gd->bg_size)); - return -EINVAL; } return 0; @@ -238,7 +233,6 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), (unsigned long long)le64_to_cpu(di->i_blkno)); - return -EINVAL; } max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); @@ -246,7 +240,6 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, do_error("Group descriptor #%llu has bit count of %u", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits)); - return -EINVAL; } /* In resize, we may meet the case bg_chain == cl_next_free_rec. */ @@ -257,7 +250,6 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, do_error("Group descriptor #%llu has bad chain %u", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_chain)); - return -EINVAL; } return 0; @@ -384,11 +376,10 @@ static int ocfs2_block_group_fill(handle_t *handle, struct super_block * sb = alloc_inode->i_sb; if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { - ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " + status = ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " "b_blocknr (%llu)", (unsigned long long)group_blkno, (unsigned long long) bg_bh->b_blocknr); - status = -EIO; goto bail; } @@ -834,9 +825,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { - ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", + status = ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", (unsigned long long)le64_to_cpu(fe->i_blkno)); - status = -EIO; goto bail; } @@ -1370,12 +1360,11 @@ int ocfs2_block_group_set_bits(handle_t *handle, le16_add_cpu(&bg->bg_free_bits_count, -num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" " count %u but claims %u are freed. 
num_bits %d", (unsigned long long)le64_to_cpu(bg->bg_blkno), le16_to_cpu(bg->bg_bits), le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; } while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); @@ -1905,13 +1894,12 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, if (le32_to_cpu(fe->id1.bitmap1.i_used) >= le32_to_cpu(fe->id1.bitmap1.i_total)) { - ocfs2_error(ac->ac_inode->i_sb, + status = ocfs2_error(ac->ac_inode->i_sb, "Chain allocator dinode %llu has %u used " "bits but only %u total.", (unsigned long long)le64_to_cpu(fe->i_blkno), le32_to_cpu(fe->id1.bitmap1.i_used), le32_to_cpu(fe->id1.bitmap1.i_total)); - status = -EIO; goto bail; } @@ -2429,12 +2417,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" " count %u but claims %u are freed. num_bits %d", (unsigned long long)le64_to_cpu(bg->bg_blkno), le16_to_cpu(bg->bg_bits), le16_to_cpu(bg->bg_free_bits_count), num_bits); - return -EROFS; } if (undo_fn) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index a24f264b2fc4..5944a311bb94 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -499,30 +499,27 @@ static int ocfs2_validate_xattr_block(struct super_block *sb, */ if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { - ocfs2_error(sb, + return ocfs2_error(sb, "Extended attribute block #%llu has bad " "signature %.*s", (unsigned long long)bh->b_blocknr, 7, xb->xb_signature); - return -EINVAL; } if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { - ocfs2_error(sb, + return ocfs2_error(sb, "Extended attribute block #%llu has an " "invalid xb_blkno of %llu", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(xb->xb_blkno)); - return -EINVAL; } if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { - ocfs2_error(sb, + return ocfs2_error(sb, "Extended attribute block #%llu has an invalid " "xb_fs_generation of #%u", (unsigned long long)bh->b_blocknr, le32_to_cpu(xb->xb_fs_generation)); - return -EINVAL; } return 0; @@ -3694,11 +3691,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, el = &eb->h_list; if (el->l_tree_depth) { - ocfs2_error(inode->i_sb, + ret = ocfs2_error(inode->i_sb, "Inode %lu has non zero tree depth in " "xattr tree block %llu\n", inode->i_ino, (unsigned long long)eb_bh->b_blocknr); - ret = -EROFS; goto out; } } @@ -3713,11 +3709,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, } if (!e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent " "record (%u, %u, 0) in xattr", inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); - ret = -EROFS; goto out; } -- cgit v1.2.3 From 34237681e02ad1617138926f437d0a147249ec13 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 4 Sep 2015 15:44:20 -0700 Subject: ocfs2: clear the rest of the buffers on error In case a validation fails, clear the rest of the buffers and return the error to the calling function. This also facilitates bubbling up the error originating from ocfs2_error to calling functions. 
Signed-off-by: Goldwyn Rodrigues Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/buffer_head_io.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 1edcb141f639..fe50ded1b4ce 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -316,6 +316,12 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr, bh = bhs[i]; if (!(flags & OCFS2_BH_READAHEAD)) { + if (status) { + /* Clear the rest of the buffers on error */ + put_bh(bh); + bhs[i] = NULL; + continue; + } /* We know this can't have changed as we hold the * owner sem. Avoid doing any work on the bh if the * journal has it. */ -- cgit v1.2.3 From 6ab855a99b735c227ad1e0deda636833f41c5b87 Mon Sep 17 00:00:00 2001 From: WeiWei Wang Date: Fri, 4 Sep 2015 15:44:23 -0700 Subject: ocfs2: add ip_alloc_sem in direct IO to protect allocation changes In ocfs2, ip_alloc_sem is used to protect allocation changes on the node. In direct IO, we add ip_alloc_sem to protect data consistency between direct IO and the ocfs2_truncate_file race (buffered IO uses ip_alloc_sem already). Although the inode->i_mutex lock is used to avoid concurrency in the above situation, I think ip_alloc_sem is still needed because protecting allocation changes is significant. Other filesystems like ext4 also use an rw_semaphore to protect data consistency in the get_block-vs-truncate race by other means, so ip_alloc_sem is needed in ocfs2 direct IO as well. Signed-off-by: Weiwei Wang Signed-off-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index b36dcad3a140..a7ab145e2901 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -533,10 +533,14 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + down_read(&OCFS2_I(inode)->ip_alloc_sem); + /* This figures out the size of the next contiguous block, and * our logical offset */ ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &contig_blocks, &ext_flags); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (ret) { mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", (unsigned long long)iblock); @@ -557,6 +561,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, alloc_locked = 1; + down_write(&OCFS2_I(inode)->ip_alloc_sem); + /* fill hole, allocate blocks can't be larger than the size * of the hole */ clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); @@ -569,6 +575,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, ret = ocfs2_extend_allocation(inode, cpos, clusters_to_alloc, 0); if (ret < 0) { + up_write(&OCFS2_I(inode)->ip_alloc_sem); mlog_errno(ret); goto bail; } @@ -576,11 +583,13 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &contig_blocks, &ext_flags); if (ret < 0) { + up_write(&OCFS2_I(inode)->ip_alloc_sem); mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", (unsigned long long)iblock); ret = -EIO; goto bail; } + up_write(&OCFS2_I(inode)->ip_alloc_sem); } /* @@ -835,12 +844,17 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, /* zeroing out the previously allocated cluster tail * that but not zeroed */ - if
(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { + down_read(&OCFS2_I(inode)->ip_alloc_sem); ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, zero_len_tail, cluster_align_tail); - else + up_read(&OCFS2_I(inode)->ip_alloc_sem); + } else { + down_write(&OCFS2_I(inode)->ip_alloc_sem); ret = ocfs2_direct_IO_extend_no_holes(osb, inode, offset); + up_write(&OCFS2_I(inode)->ip_alloc_sem); + } if (ret < 0) { mlog_errno(ret); ocfs2_inode_unlock(inode, 1); -- cgit v1.2.3 From 928dda1f9433f024ac48c3d97ae683bf83dd0e42 Mon Sep 17 00:00:00 2001 From: Yiwen Jiang Date: Fri, 4 Sep 2015 15:44:25 -0700 Subject: ocfs2: fix a tiny case where an inode cannot be removed When running dirop_fileop_racer we found a case where an inode cannot be removed. Two nodes, say Node A and Node B, mount the same ocfs2 volume. Create two dirs /race/1/ and /race/2/ in the filesystem. Node A Node B rm -r /race/2/ mv /race/1/ /race/2/ call ocfs2_unlink(), get the EX mode of /race/2/ wait for B to unlock /race/2/ decrease i_nlink of /race/2/ to 0, and add inode of /race/2/ into orphan dir, unlock /race/2/ got EX mode of /race/2/. Because /race/1/ is a dir, inc i_nlink of /race/2/ and update it on disk, unlock /race/2/. Because i_nlink of /race/2/ is not zero, this inode will always remain in the orphan dir. This patch fixes this case by testing whether i_nlink of the new dir is zero. Signed-off-by: Yiwen Jiang Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Cc: Xue jiufei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 1c43993e81b0..b7dfac226b1e 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1309,6 +1309,11 @@ static int ocfs2_rename(struct inode *old_dir, } parents_locked = 1; + if (!new_dir->i_nlink) { + status = -EACCES; + goto bail; + } + /* make sure both dirs have bhs * get an extra ref on old_dir_bh if old==new */ if (!new_dir_bh) { -- cgit v1.2.3 From 72f6fe1fe5a386225cdc30f025681830a63a117e Mon Sep 17 00:00:00 2001 From: "Norton.Zhu" Date: Fri, 4 Sep 2015 15:44:28 -0700 Subject: ocfs2: optimize error handling in dlm_request_join Currently error handling in dlm_request_join is a little obscure, so optimize it to promote readability. If packet.code is invalid, reset it to JOIN_DISALLOW to keep it meaningful. It only influences the log printing. Signed-off-by: Norton.Zhu Cc: Srinivas Eeda Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdomain.c | 71 ++++++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 019459b20aeb..6918f30d02cd 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1465,39 +1465,46 @@ static int dlm_request_join(struct dlm_ctxt *dlm, if (status == -ENOPROTOOPT) { status = 0; *response = JOIN_OK_NO_MAP; - } else if (packet.code == JOIN_DISALLOW || - packet.code == JOIN_OK_NO_MAP) { - *response = packet.code; - } else if (packet.code == JOIN_PROTOCOL_MISMATCH) { - mlog(ML_NOTICE, - "This node requested DLM locking protocol %u.%u and " - "filesystem locking protocol %u.%u. 
At least one of " - "the protocol versions on node %d is not compatible, " - "disconnecting\n", - dlm->dlm_locking_proto.pv_major, - dlm->dlm_locking_proto.pv_minor, - dlm->fs_locking_proto.pv_major, - dlm->fs_locking_proto.pv_minor, - node); - status = -EPROTO; - *response = packet.code; - } else if (packet.code == JOIN_OK) { - *response = packet.code; - /* Use the same locking protocol as the remote node */ - dlm->dlm_locking_proto.pv_minor = packet.dlm_minor; - dlm->fs_locking_proto.pv_minor = packet.fs_minor; - mlog(0, - "Node %d responds JOIN_OK with DLM locking protocol " - "%u.%u and fs locking protocol %u.%u\n", - node, - dlm->dlm_locking_proto.pv_major, - dlm->dlm_locking_proto.pv_minor, - dlm->fs_locking_proto.pv_major, - dlm->fs_locking_proto.pv_minor); } else { - status = -EINVAL; - mlog(ML_ERROR, "invalid response %d from node %u\n", - packet.code, node); + *response = packet.code; + switch (packet.code) { + case JOIN_DISALLOW: + case JOIN_OK_NO_MAP: + break; + case JOIN_PROTOCOL_MISMATCH: + mlog(ML_NOTICE, + "This node requested DLM locking protocol %u.%u and " + "filesystem locking protocol %u.%u. At least one of " + "the protocol versions on node %d is not compatible, " + "disconnecting\n", + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor, + node); + status = -EPROTO; + break; + case JOIN_OK: + /* Use the same locking protocol as the remote node */ + dlm->dlm_locking_proto.pv_minor = packet.dlm_minor; + dlm->fs_locking_proto.pv_minor = packet.fs_minor; + mlog(0, + "Node %d responds JOIN_OK with DLM locking protocol " + "%u.%u and fs locking protocol %u.%u\n", + node, + dlm->dlm_locking_proto.pv_major, + dlm->dlm_locking_proto.pv_minor, + dlm->fs_locking_proto.pv_major, + dlm->fs_locking_proto.pv_minor); + break; + default: + status = -EINVAL; + mlog(ML_ERROR, "invalid response %d from node %u\n", + packet.code, node); + /* Reset response to JOIN_DISALLOW */ + *response = JOIN_DISALLOW; + break; + } } mlog(0, "status %d, node %d response is %d\n", status, node, -- cgit v1.2.3 From 3d46a44a0c01b15d385ccaae24b56f619613c256 Mon Sep 17 00:00:00 2001 From: Tariq Saeed Date: Fri, 4 Sep 2015 15:44:31 -0700 Subject: ocfs2: fix BUG_ON() in ocfs2_ci_checkpointed() PID: 614 TASK: ffff882a739da580 CPU: 3 COMMAND: "ocfs2dc" #0 [ffff882ecc3759b0] machine_kexec at ffffffff8103b35d #1 [ffff882ecc375a20] crash_kexec at ffffffff810b95b5 #2 [ffff882ecc375af0] oops_end at ffffffff815091d8 #3 [ffff882ecc375b20] die at ffffffff8101868b #4 [ffff882ecc375b50] do_trap at ffffffff81508bb0 #5 [ffff882ecc375ba0] do_invalid_op at ffffffff810165e5 #6 [ffff882ecc375c40] invalid_op at ffffffff815116fb [exception RIP: ocfs2_ci_checkpointed+208] RIP: ffffffffa0a7e940 RSP: ffff882ecc375cf0 RFLAGS: 00010002 RAX: 0000000000000001 RBX: 000000000000654b RCX: ffff8812dc83f1f8 RDX: 00000000000017d9 RSI: ffff8812dc83f1f8 RDI: ffffffffa0b2c318 RBP: ffff882ecc375d20 R8: ffff882ef6ecfa60 R9: ffff88301f272200 R10: 0000000000000000 R11: 0000000000000000 R12: ffffffffffffffff R13: ffff8812dc83f4f0 R14: 0000000000000000 R15: ffff8812dc83f1f8 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [ffff882ecc375d28] ocfs2_check_meta_downconvert at ffffffffa0a7edbd [ocfs2] #8 [ffff882ecc375d38] ocfs2_unblock_lock at ffffffffa0a84af8 [ocfs2] #9 [ffff882ecc375dc8] ocfs2_process_blocked_lock at ffffffffa0a85285 [ocfs2] #10 [ffff882ecc375e18] ocfs2_downconvert_thread_do_work at ffffffffa0a85445 [ocfs2] #11 [ffff882ecc375e68] ocfs2_downconvert_thread 
at ffffffffa0a854de [ocfs2] #12 [ffff882ecc375ee8] kthread at ffffffff81090da7 #13 [ffff882ecc375f48] kernel_thread_helper at ffffffff81511884 The assert is tripped because the transaction is not checkpointed and the lock level is PR. Some time ago, a chmod command had been executed. As a result, the following call chain left the inode cluster lock in PR state, later on causing the assert. system_call_fastpath -> my_chmod -> sys_chmod -> sys_fchmodat -> notify_change -> ocfs2_setattr -> posix_acl_chmod -> ocfs2_iop_set_acl -> ocfs2_set_acl -> ocfs2_acl_set_mode Here is how. 1119 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 1120 { 1247 ocfs2_inode_unlock(inode, 1); <<< WRONG thing to do. .. 1258 if (!status && attr->ia_valid & ATTR_MODE) { 1259 status = posix_acl_chmod(inode, inode->i_mode); 519 posix_acl_chmod(struct inode *inode, umode_t mode) 520 { .. 539 ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS); 287 int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, ... 288 { 289 return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); 224 int ocfs2_set_acl(handle_t *handle, 225 struct inode *inode, ... 231 { .. 252 ret = ocfs2_acl_set_mode(inode, di_bh, 253 handle, mode); 168 static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head ... 170 { 183 if (handle == NULL) { >>> BUG: inode lock not held in ex at this point <<< 184 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), 185 OCFS2_INODE_UPDATE_CREDITS); At ocfs2_setattr.#1247 we unlock and at #1259 call posix_acl_chmod. When we reach ocfs2_acl_set_mode.#181 and do the trans, the inode cluster lock is not held in EX mode (it should be). How could this have happened? We are the lock master, were holding the lock EX and have released it in ocfs2_setattr.#1247. Note that there are no holders of this lock at this point. Another node needs the lock in PR, and we downconvert from EX to PR. So the inode lock is PR when we do the trans in ocfs2_acl_set_mode.#184. The trans stays in core (not flushed to disk). Now another node wants the lock in EX, the downconvert thread gets kicked (the one that tripped the assert above), finds an unflushed trans but the lock is not EX (it is PR). If the lock was at EX, it would have flushed the trans (ocfs2_ci_checkpointed -> ocfs2_start_checkpoint) before downconverting (to NULL) for the request. ocfs2_setattr must not drop the inode lock EX in this code path. If it does and takes it again before the trans, say in ocfs2_set_acl, another cluster node can get in between, execute another setattr, overwriting the one in progress on this node, resulting in a mode/acl/size combination that is a mix of the two.
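A compact model of the invariant the fix enforces (ex_held, start_trans and setattr_fixed are invented names, not the ocfs2 API): every journal transaction started on the setattr path must begin while the node still holds the inode cluster lock at EX, and the single unlock happens only at the end.

#include <assert.h>
#include <stdio.h>

static int ex_held;	/* models the inode cluster lock held at EX */

static void start_trans(const char *who)
{
	/* The downconvert thread's checkpoint logic only flushes when
	 * the lock is still EX; a trans begun at PR trips the BUG_ON. */
	assert(ex_held && "transaction started without EX lock");
	printf("%s: dirtying journal under EX\n", who);
}

static void setattr_fixed(void)
{
	ex_held = 1;			/* ocfs2_inode_lock(inode, 1) */
	start_trans("ocfs2_setattr");
	start_trans("posix_acl_chmod");	/* still under the same EX hold */
	ex_held = 0;			/* single unlock at the end */
}

int main(void)
{
	setattr_fixed();
	return 0;
}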
Orabug: 20189959 Signed-off-by: Tariq Saeed Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index c4a99fb61c3e..0e5b4515f92e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1130,6 +1130,7 @@ out: int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) { int status = 0, size_change; + int inode_locked = 0; struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; struct ocfs2_super *osb = OCFS2_SB(sb); @@ -1178,6 +1179,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) mlog_errno(status); goto bail_unlock_rw; } + inode_locked = 1; if (size_change) { status = inode_newsize_ok(inode, attr->ia_size); @@ -1258,7 +1260,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: - ocfs2_inode_unlock(inode, 1); + if (status) { + ocfs2_inode_unlock(inode, 1); + inode_locked = 0; + } bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); @@ -1274,6 +1279,8 @@ bail: if (status < 0) mlog_errno(status); } + if (inode_locked) + ocfs2_inode_unlock(inode, 1); return status; } -- cgit v1.2.3 From 743b5f1434f57a147226c747fe228cadeb7b05ed Mon Sep 17 00:00:00 2001 From: Tariq Saeed Date: Fri, 4 Sep 2015 15:44:34 -0700 Subject: ocfs2: take inode lock in ocfs2_iop_set/get_acl() This bug in mainline code is pointed out by Mark Fasheh. When ocfs2_iop_set_acl() and ocfs2_iop_get_acl() are entered from the VFS layer, the inode lock is not held. This seems to be a regression from older kernels. The patch is to fix that. Orabug: 20189959 Signed-off-by: Tariq Saeed Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/acl.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index c58a1bcfda0f..0cdf497c91ef 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -284,7 +284,19 @@ int ocfs2_set_acl(handle_t *handle, int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); + struct buffer_head *bh = NULL; + int status = 0; + + status = ocfs2_inode_lock(inode, &bh, 1); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); + ocfs2_inode_unlock(inode, 1); + brelse(bh); + return status; } struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) @@ -292,19 +304,21 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; struct posix_acl *acl; - int ret = -EAGAIN; + int ret; osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return NULL; - - ret = ocfs2_read_inode_block(inode, &di_bh); - if (ret < 0) + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + if (ret != -ENOENT) + mlog_errno(ret); return ERR_PTR(ret); + } acl = ocfs2_get_acl_nolock(inode, type, di_bh); + ocfs2_inode_unlock(inode, 0); brelse(di_bh); - return acl; } -- cgit v1.2.3 From f57a22ddecd6f26040a67e2c12880f98f88b6e00 Mon Sep 17 00:00:00 2001 From: Yiwen Jiang Date: Fri, 4 Sep 2015 15:44:37 -0700 Subject: ocfs2: avoid accessing an invalid address when reading o2dlm debug messages The following case will lead to a lockres that is freed but still in
use. cat /sys/kernel/debug/o2dlm/locking_state dlm_thread lockres_seq_start -> lock dlm->track_lock -> get resA resA->refs decrease to 0, call dlm_lockres_release, and wait for "cat" to unlock. Although resA->refs is already set to 0, increase resA->refs, and then unlock lock dlm->track_lock -> list_del_init() -> unlock -> free resA In such a race case, an invalid address access may occur. So we should delete res->tracking from the list before resA->refs decreases to 0. Signed-off-by: Yiwen Jiang Reviewed-by: Joseph Qi Cc: Joel Becker Signed-off-by: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmmaster.c | 22 +++++++++++----------- fs/ocfs2/dlm/dlmthread.c | 10 ++++++++++ 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index fdf4b41d0609..46b8b2bbc95a 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -498,16 +498,6 @@ static void dlm_lockres_release(struct kref *kref) mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); - spin_lock(&dlm->track_lock); - if (!list_empty(&res->tracking)) - list_del_init(&res->tracking); - else { - mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", - res->lockname.len, res->lockname.name); - dlm_print_one_lock_resource(res); - } - spin_unlock(&dlm->track_lock); - atomic_dec(&dlm->res_cur_count); if (!hlist_unhashed(&res->hash_node) || @@ -795,8 +785,18 @@ lookup: dlm_lockres_grab_inflight_ref(dlm, tmpres); spin_unlock(&tmpres->spinlock); - if (res) + if (res) { + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else + mlog(ML_ERROR, "Resource %.*s not " "on the Tracking list\n", + res->lockname.len, + res->lockname.name); + spin_unlock(&dlm->track_lock); dlm_lockres_put(res); + } res = tmpres; goto leave; } diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 69aac6f088ad..2e5e6d5fffe8 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -211,6 +211,16 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm, __dlm_unhash_lockres(dlm, res); + spin_lock(&dlm->track_lock); + if (!list_empty(&res->tracking)) + list_del_init(&res->tracking); + else { + mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", + res->lockname.len, res->lockname.name); + __dlm_print_one_lock_resource(res); + } + spin_unlock(&dlm->track_lock); + /* lockres is not in the hash now. drop the flag and wake up * any processes waiting in dlm_get_lock_resource. */ if (!master) { -- cgit v1.2.3 From ad694821224634d46b6571f0161e85ac2e397396 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 4 Sep 2015 15:44:40 -0700 Subject: ocfs2: fix race between crashed dio and rm There is a race case between crashed dio and rm, which will lead to an inode with OCFS2_VALID_FL not set and the filesystem being set read-only. N1 N2 ------------------------------------------------------------------------ dd with direct flag rm file crashed with a dio entry left in the orphan dir clear OCFS2_VALID_FL in ocfs2_remove_inode recover N1 and read the corrupted inode, and set filesystem read-only So we skip the inode deletion this time and wait for the dio entry to be recovered first.
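A toy model of the added check (flag names shortened; try_delete_inode is a stand-in for the ocfs2_delete_inode path): deletion is deferred while the on-disk inode still carries the dio-orphan flag.

#include <stdio.h>

#define VALID_FL        (1u << 0)
#define DIO_ORPHANED_FL (1u << 1)

static int try_delete_inode(unsigned int di_flags)
{
	if (di_flags & DIO_ORPHANED_FL) {
		printf("skip wipe: wait for dio orphan recovery\n");
		return 0;	/* deletion deferred */
	}
	printf("clear VALID_FL and wipe inode\n");
	return 1;
}

int main(void)
{
	/* N2's rm arriving after N1 crashed mid direct IO */
	try_delete_inode(VALID_FL | DIO_ORPHANED_FL);
	/* normal path */
	try_delete_inode(VALID_FL);
	return 0;
}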
Signed-off-by: Joseph Qi Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 9 +++++++++ fs/ocfs2/journal.c | 4 +++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7868f7e7c455..fe4b3f7db245 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -971,6 +971,7 @@ static void ocfs2_delete_inode(struct inode *inode) int wipe, status; sigset_t oldset; struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; trace_ocfs2_delete_inode(inode->i_ino, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -1025,6 +1026,14 @@ static void ocfs2_delete_inode(struct inode *inode) goto bail_unlock_nfs_sync; } + di = (struct ocfs2_dinode *)di_bh->b_data; + /* Skip inode deletion and wait for dio orphan entry recovered + * first */ + if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { + ocfs2_cleanup_delete_inode(inode, 0); + goto bail_unlock_inode; + } + /* Query the cluster. This will be the final decision made * before we go ahead and wipe the inode. */ status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 3bfd36a23e40..52948af646b6 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2210,7 +2210,9 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, * ocfs2_delete_inode. */ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; spin_unlock(&oi->ip_lock); - } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && + } + + if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { ret = ocfs2_truncate_file(inode, di_bh, i_size_read(inode)); -- cgit v1.2.3 From 40476b8294466d40e7db57b4cbf69a831a4486b8 Mon Sep 17 00:00:00 2001 From: Tina Ruchandani Date: Fri, 4 Sep 2015 15:44:43 -0700 Subject: ocfs2: use 64bit variables to track heartbeat time o2hb_elapsed_msecs computes the time taken for a disk heartbeat. 'struct timeval' variables are used to store start and end times. On 32-bit systems, the 'tv_sec' component of 'struct timeval' will overflow in year 2038 and beyond. This patch solves the overflow with the following: 1. Replace o2hb_elapsed_msecs using 'ktime_t' values to measure start and end time, and built-in function 'ktime_ms_delta' to compute the elapsed time. ktime_get_real() is used since the code prints out the wallclock time. 2. Changes format string to print time as a single 64-bit nanoseconds value ("%lld") instead of seconds and microseconds. This simplifies the code since converting ktime_t to that format would need expensive computation. However, the debug log string is less readable than the previous format. Signed-off-by: Tina Ruchandani Suggested by: Arnd Bergmann Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 49 ++++++++------------------------------------ 1 file changed, 9 insertions(+), 40 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f97306453a0b..fa15debcc02b 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -36,7 +36,7 @@ #include #include #include - +#include #include "heartbeat.h" #include "tcp.h" #include "nodemanager.h" @@ -1060,37 +1060,6 @@ bail: return ret; } -/* Subtract b from a, storing the result in a. a *must* have a larger - * value than b. 
*/ -static void o2hb_tv_subtract(struct timeval *a, - struct timeval *b) -{ - /* just return 0 when a is after b */ - if (a->tv_sec < b->tv_sec || - (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) { - a->tv_sec = 0; - a->tv_usec = 0; - return; - } - - a->tv_sec -= b->tv_sec; - a->tv_usec -= b->tv_usec; - while ( a->tv_usec < 0 ) { - a->tv_sec--; - a->tv_usec += 1000000; - } -} - -static unsigned int o2hb_elapsed_msecs(struct timeval *start, - struct timeval *end) -{ - struct timeval res = *end; - - o2hb_tv_subtract(&res, start); - - return res.tv_sec * 1000 + res.tv_usec / 1000; -} - /* * we ride the region ref that the region dir holds. before the region * dir is removed and drops it ref it will wait to tear down this @@ -1101,7 +1070,7 @@ static int o2hb_thread(void *data) int i, ret; struct o2hb_region *reg = data; struct o2hb_bio_wait_ctxt write_wc; - struct timeval before_hb, after_hb; + ktime_t before_hb, after_hb; unsigned int elapsed_msec; mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); @@ -1118,18 +1087,18 @@ static int o2hb_thread(void *data) * hr_timeout_ms between disk writes. On busy systems * this should result in a heartbeat which is less * likely to time itself out. */ - do_gettimeofday(&before_hb); + before_hb = ktime_get_real(); ret = o2hb_do_disk_heartbeat(reg); - do_gettimeofday(&after_hb); - elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); + after_hb = ktime_get_real(); + + elapsed_msec = (unsigned int) + ktime_ms_delta(after_hb, before_hb); mlog(ML_HEARTBEAT, - "start = %lu.%lu, end = %lu.%lu, msec = %u, ret = %d\n", - before_hb.tv_sec, (unsigned long) before_hb.tv_usec, - after_hb.tv_sec, (unsigned long) after_hb.tv_usec, - elapsed_msec, ret); + "start = %lld, end = %lld, msec = %u, ret = %d\n", + before_hb.tv64, after_hb.tv64, elapsed_msec, ret); if (!kthread_should_stop() && elapsed_msec < reg->hr_timeout_ms) { -- cgit v1.2.3 From 7f27ec978b0ef37391262bbf15c587fd8526e268 Mon Sep 17 00:00:00 2001 From: yangwenfang Date: Fri, 4 Sep 2015 15:44:45 -0700 Subject: ocfs2: call ocfs2_journal_access_di() before ocfs2_journal_dirty() in ocfs2_write_end_nolock() 1: After we call ocfs2_journal_access_di() in ocfs2_write_begin(), jbd2_journal_restart() may also be called; this function decrements transaction A's t_updates and obtains a new transaction B. If jbd2_journal_commit_transaction() happens to commit transaction A when t_updates==0, it will continue to complete the commit and unfile the buffer. So when jbd2_journal_dirty_metadata() runs, the handle points to the new transaction B while the buffer head's journal head is already freed (jh->b_transaction == NULL, jh->b_next_transaction == NULL); it returns EINVAL, so it triggers the BUG_ON(status). thread 1 jbd2 ocfs2_write_begin jbd2_journal_commit_transaction ocfs2_write_begin_nolock ocfs2_start_trans jbd2__journal_start(t_updates+1, transaction A) ocfs2_journal_access_di ocfs2_write_cluster_by_desc ocfs2_mark_extent_written ocfs2_change_extent_flag ocfs2_split_extent ocfs2_extend_rotate_transaction jbd2_journal_restart (t_updates-1,transaction B) t_updates==0 __jbd2_journal_refile_buffer (jh->b_transaction = NULL) ocfs2_write_end ocfs2_write_end_nolock ocfs2_journal_dirty jbd2_journal_dirty_metadata(bug) ocfs2_commit_trans 2. In ext4, I found that jbd2_journal_get_write_access() is called by ext4_write_end:
ext4_write_begin ext4_journal_start __ext4_journal_start_sb ext4_journal_check_start jbd2__journal_start ext4_write_end ext4_mark_inode_dirty ext4_reserve_inode_write ext4_journal_get_write_access jbd2_journal_get_write_access ext4_mark_iloc_dirty ext4_do_update_inode ext4_handle_dirty_metadata jbd2_journal_dirty_metadata 3. So I think we should put ocfs2_journal_access_di before ocfs2_journal_dirty in ocfs2_write_end, and it works well after my modification. Signed-off-by: vicky Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Zhangguanghui Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a7ab145e2901..faf36a96cd19 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2207,10 +2207,7 @@ try_again: if (ret) goto out_commit; } - /* - * We don't want this to fail in ocfs2_write_end(), so do it - * here. - */ + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { @@ -2367,7 +2364,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - int i; + int i, ret; unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2376,6 +2373,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, handle_t *handle = wc->w_handle; struct page *tmppage; + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + copied = ret; + mlog_errno(ret); + goto out; + } + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { ocfs2_write_end_inline(inode, pos, len, &copied, di, wc); goto out_write_size; @@ -2431,6 +2436,7 @@ out_write_size: ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, wc->w_di_bh); +out: /* unlock pages before dealloc since it needs acquiring j_trans_barrier * lock, or it will cause a deadlock since journal commit threads holds * this lock and will ask for the page lock when flushing the data. -- cgit v1.2.3 From d0c97d52f5e1de125394d748be7bd5763fd9ed9e Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Fri, 4 Sep 2015 15:44:48 -0700 Subject: ocfs2: do not set fs read-only if rec[0] is empty while committing truncate While appending an extent to a file, it will call these functions: ocfs2_insert_extent -> call ocfs2_grow_tree() if there's no free rec -> ocfs2_add_branch add a new branch to extent tree, now rec[0] in the leaf of rightmost path is empty -> ocfs2_do_insert_extent -> ocfs2_rotate_tree_right -> ocfs2_extend_rotate_transaction -> jbd2_journal_restart if jbd2_journal_extend fail -> ocfs2_insert_path -> ocfs2_extend_trans -> jbd2_journal_restart if jbd2_journal_extend fail -> ocfs2_insert_at_leaf -> ocfs2_et_update_clusters Function jbd2_journal_restart() may be called and it may happen that buffers dirtied in ocfs2_add_branch() are committed while buffers dirtied in ocfs2_insert_at_leaf() and ocfs2_et_update_clusters() are not. So an empty rec[0] is left in the rightmost path, which will cause a read-only filesystem when ocfs2_commit_truncate() is called, with the error message: "Inode %lu has an empty extent record". This is not a serious problem, so remove the rightmost path when calling ocfs2_commit_truncate().
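A sketch of the retry loop this patch adds, under the assumption that removing the rightmost path clears the empty record (commit_truncate and remove_rightmost_empty_extent are illustrative stand-ins for the real ocfs2 helpers):

#include <stdio.h>

static int rightmost_rec_empty = 1;	/* the leftover state described above */

static int remove_rightmost_empty_extent(void)
{
	/* models ocfs2_remove_rightmost_path() inside a short transaction */
	rightmost_rec_empty = 0;
	return 0;
}

static int commit_truncate(void)
{
	int status;
start:
	if (rightmost_rec_empty) {
		status = remove_rightmost_empty_extent();
		if (status)
			return status;	/* error: bail, but no read-only */
		goto start;		/* reinit the path and rescan */
	}
	printf("truncate record as usual\n");
	return 0;
}

int main(void)
{
	return commit_truncate();
}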
Signed-off-by: joyce.xue Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9a0fd494fe74..77cbd1e3c950 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -3131,6 +3131,30 @@ out: return ret; } +static int ocfs2_remove_rightmost_empty_extent(struct ocfs2_super *osb, + struct ocfs2_extent_tree *et, + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + handle_t *handle; + int ret; + int credits = path->p_tree_depth * 2 + 1; + + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + ret = ocfs2_remove_rightmost_path(handle, et, path, dealloc); + if (ret) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); + return ret; +} + /* * Left rotation of btree records. * @@ -7108,15 +7132,23 @@ start: * to check it up here before changing the tree. */ if (root_el->l_tree_depth && rec->e_int_clusters == 0) { - ocfs2_error(inode->i_sb, "Inode %lu has an empty " + mlog(ML_ERROR, "Inode %lu has an empty " "extent record, depth %u\n", inode->i_ino, le16_to_cpu(root_el->l_tree_depth)); - status = -EROFS; - goto bail; + status = ocfs2_remove_rightmost_empty_extent(osb, + &et, path, &dealloc); + if (status) { + mlog_errno(status); + goto bail; + } + + ocfs2_reinit_path(path, 1); + goto start; + } else { + trunc_cpos = le32_to_cpu(rec->e_cpos); + trunc_len = 0; + blkno = 0; } - trunc_cpos = le32_to_cpu(rec->e_cpos); - trunc_len = 0; - blkno = 0; } else if (le32_to_cpu(rec->e_cpos) >= new_highest_cpos) { /* * Truncate entire record. -- cgit v1.2.3 From 7ecef14ab1db961545354fa443749aeda2ea1b75 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 4 Sep 2015 15:44:51 -0700 Subject: ocfs2: neaten do_error, ocfs2_error and ocfs2_abort These uses sometimes do and sometimes don't have '\n' terminations. Make the uses consistently use '\n' terminations and remove the newline from the functions. 
Miscellanea: o Coalesce formats o Realign arguments Signed-off-by: Joe Perches Reviewed-by: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 86 ++++++++++++++++++------------------------------- fs/ocfs2/aops.c | 4 +-- fs/ocfs2/dir.c | 49 +++++++++++++--------------- fs/ocfs2/extent_map.c | 22 +++++++------ fs/ocfs2/inode.c | 18 +++++------ fs/ocfs2/journal.c | 2 +- fs/ocfs2/localalloc.c | 3 +- fs/ocfs2/move_extents.c | 5 ++- fs/ocfs2/quota_local.c | 3 +- fs/ocfs2/refcounttree.c | 53 +++++++++++++----------------- fs/ocfs2/suballoc.c | 75 ++++++++++++++++++++---------------------- fs/ocfs2/super.c | 4 +-- fs/ocfs2/super.h | 6 ++-- fs/ocfs2/xattr.c | 35 +++++++++----------- 14 files changed, 163 insertions(+), 202 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 77cbd1e3c950..b20706e8a4d1 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -909,27 +909,25 @@ static int ocfs2_validate_extent_block(struct super_block *sb, if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { rc = ocfs2_error(sb, - "Extent block #%llu has bad signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - eb->h_signature); + "Extent block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + eb->h_signature); goto bail; } if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { rc = ocfs2_error(sb, - "Extent block #%llu has an invalid h_blkno " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(eb->h_blkno)); + "Extent block #%llu has an invalid h_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(eb->h_blkno)); goto bail; } if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { rc = ocfs2_error(sb, - "Extent block #%llu has an invalid " - "h_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(eb->h_fs_generation)); + "Extent block #%llu has an invalid h_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(eb->h_fs_generation)); goto bail; } bail: @@ -1446,8 +1444,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty " - "extent list (next_free_rec == 0)", + "Owner %llu has empty extent list (next_free_rec == 0)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); status = -EIO; goto bail; @@ -1456,9 +1453,7 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (!blkno) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has extent " - "list where extent # %d has no physical " - "block start", + "Owner %llu has extent list where extent # %d has no physical block start\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i); status = -EIO; goto bail; @@ -1788,8 +1783,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, while (el->l_tree_depth) { if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has empty extent list at " - "depth %u\n", + "Owner %llu has empty extent list at depth %u\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth)); ret = -EROFS; @@ -1814,8 +1808,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, blkno = le64_to_cpu(el->l_recs[i].e_blkno); if (blkno == 0) { 
ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has bad blkno in extent list " - "at depth %u (index %d)\n", + "Owner %llu has bad blkno in extent list at depth %u (index %d)\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), le16_to_cpu(el->l_tree_depth), i); ret = -EROFS; @@ -1836,8 +1829,7 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), - "Owner %llu has bad count in extent list " - "at block %llu (next free=%u, count=%u)\n", + "Owner %llu has bad count in extent list at block %llu (next free=%u, count=%u)\n", (unsigned long long)ocfs2_metadata_cache_owner(ci), (unsigned long long)bh->b_blocknr, le16_to_cpu(el->l_next_free_rec), @@ -2116,8 +2108,7 @@ static int ocfs2_rotate_subtree_right(handle_t *handle, if (left_el->l_next_free_rec != left_el->l_count) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Inode %llu has non-full interior leaf node %llu" - "(next free = %u)", + "Inode %llu has non-full interior leaf node %llu (next free = %u)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)left_leaf_bh->b_blocknr, le16_to_cpu(left_el->l_next_free_rec)); @@ -2256,8 +2247,7 @@ int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, * If we got here, we never found a valid node where * the tree indicated one should be. */ - ocfs2_error(sb, - "Invalid extent tree at extent block %llu\n", + ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", (unsigned long long)blkno); ret = -EROFS; goto out; @@ -2872,8 +2862,7 @@ int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, * If we got here, we never found a valid node where * the tree indicated one should be. */ - ocfs2_error(sb, - "Invalid extent tree at extent block %llu\n", + ocfs2_error(sb, "Invalid extent tree at extent block %llu\n", (unsigned long long)blkno); ret = -EROFS; goto out; @@ -3224,7 +3213,7 @@ rightmost_no_delete: if (le16_to_cpu(el->l_next_free_rec) == 0) { ret = -EIO; ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has empty extent block at %llu", + "Owner %llu has empty extent block at %llu\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), (unsigned long long)le64_to_cpu(eb->h_blkno)); goto out; @@ -3954,7 +3943,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, next_free = le16_to_cpu(el->l_next_free_rec); if (next_free == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has a bad extent list", + "Owner %llu has a bad extent list\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); ret = -EIO; return; @@ -4379,10 +4368,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, bh = path_leaf_bh(left_path); eb = (struct ocfs2_extent_block *)bh->b_data; ocfs2_error(sb, - "Extent block #%llu has an " - "invalid l_next_free_rec of " - "%d. It should have " - "matched the l_count of %d", + "Extent block #%llu has an invalid l_next_free_rec of %d. 
It should have matched the l_count of %d\n", (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(new_el->l_next_free_rec), le16_to_cpu(new_el->l_count)); @@ -4437,8 +4423,7 @@ static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, bh = path_leaf_bh(right_path); eb = (struct ocfs2_extent_block *)bh->b_data; ocfs2_error(sb, - "Extent block #%llu has an " - "invalid l_next_free_rec of %d", + "Extent block #%llu has an invalid l_next_free_rec of %d\n", (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(new_el->l_next_free_rec)); status = -EINVAL; @@ -4994,10 +4979,9 @@ leftright: split_index = ocfs2_search_extent_list(el, cpos); if (split_index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has an extent at cpos %u " - "which can no longer be found.\n", - (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), - cpos); + "Owner %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); ret = -EROFS; goto out; } @@ -5182,10 +5166,9 @@ int ocfs2_change_extent_flag(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(sb, - "Owner %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long) - ocfs2_metadata_cache_owner(et->et_ci), cpos); + "Owner %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); ret = -EROFS; goto out; } @@ -5252,9 +5235,7 @@ int ocfs2_mark_extent_written(struct inode *inode, cpos, len, phys); if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { - ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " - "that are being written to, but the feature bit " - "is not set in the super block.", + ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents that are being written to, but the feature bit is not set in the super block\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); ret = -EROFS; goto out; @@ -5538,8 +5519,7 @@ int ocfs2_remove_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu has an extent at cpos %u which can no " - "longer be found.\n", + "Owner %llu has an extent at cpos %u which can no longer be found\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; @@ -5604,7 +5584,7 @@ int ocfs2_remove_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu: split at cpos %u lost record.", + "Owner %llu: split at cpos %u lost record\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos); ret = -EROFS; @@ -5620,8 +5600,7 @@ int ocfs2_remove_extent(handle_t *handle, ocfs2_rec_clusters(el, rec); if (rec_range != trunc_range) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), - "Owner %llu: error after split at cpos %u" - "trunc len %u, existing record is (%u,%u)", + "Owner %llu: error after split at cpos %u trunc len %u, existing record is (%u,%u)\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), cpos, len, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); @@ -7236,8 +7215,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || !ocfs2_supports_inline_data(osb)) { ocfs2_error(inode->i_sb, - "Inline data flags for inode %llu 
don't agree! " - "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", + "Inline data flags for inode %llu don't agree! Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, le16_to_cpu(di->i_dyn_features), OCFS2_I(inode)->ip_dyn_features, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index faf36a96cd19..64b11d90eca6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; if (!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL)) { - ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag", + ocfs2_error(inode->i_sb, "Inode %llu lost inline data flag\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); return -EROFS; } @@ -237,7 +237,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, if (size > PAGE_CACHE_SIZE || size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, - "Inode %llu has with inline data has bad size: %Lu", + "Inode %llu has with inline data has bad size: %Lu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)size); return -EROFS; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 25f03af09237..ffecf89c8c1c 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -481,29 +481,25 @@ static int ocfs2_check_dir_trailer(struct inode *dir, struct buffer_head *bh) trailer = ocfs2_trailer_from_bh(bh, dir->i_sb); if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { rc = ocfs2_error(dir->i_sb, - "Invalid dirblock #%llu: " - "signature = %.*s\n", - (unsigned long long)bh->b_blocknr, 7, - trailer->db_signature); + "Invalid dirblock #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + trailer->db_signature); goto out; } if (le64_to_cpu(trailer->db_blkno) != bh->b_blocknr) { rc = ocfs2_error(dir->i_sb, - "Directory block #%llu has an invalid " - "db_blkno of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); + "Directory block #%llu has an invalid db_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } if (le64_to_cpu(trailer->db_parent_dinode) != OCFS2_I(dir)->ip_blkno) { rc = ocfs2_error(dir->i_sb, - "Directory block #%llu on dinode " - "#%llu has an invalid parent_dinode " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)OCFS2_I(dir)->ip_blkno, - (unsigned long long)le64_to_cpu(trailer->db_blkno)); + "Directory block #%llu on dinode #%llu has an invalid parent_dinode of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)OCFS2_I(dir)->ip_blkno, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); goto out; } out: @@ -602,9 +598,9 @@ static int ocfs2_validate_dx_root(struct super_block *sb, if (!OCFS2_IS_VALID_DX_ROOT(dx_root)) { ret = ocfs2_error(sb, - "Dir Index Root # %llu has bad signature %.*s", - (unsigned long long)le64_to_cpu(dx_root->dr_blkno), - 7, dx_root->dr_signature); + "Dir Index Root # %llu has bad signature %.*s\n", + (unsigned long long)le64_to_cpu(dx_root->dr_blkno), + 7, dx_root->dr_signature); } return ret; @@ -644,8 +640,8 @@ static int ocfs2_validate_dx_leaf(struct super_block *sb, } if (!OCFS2_IS_VALID_DX_LEAF(dx_leaf)) { - ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s", - 7, dx_leaf->dl_signature); + ret = ocfs2_error(sb, "Dir Index Leaf has bad signature %.*s\n", + 7, dx_leaf->dl_signature); } return ret; @@ -808,9 +804,9 @@ static int 
ocfs2_dx_dir_lookup_rec(struct inode *inode, if (el->l_tree_depth) { ret = ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "btree tree block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); + "Inode %lu has non zero tree depth in btree tree block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -826,10 +822,11 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode, } if (!found) { - ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in btree", inode->i_ino, - le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec)); + ret = ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0) in btree\n", + inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); goto out; } diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 767370b656ca..e4719e0a3f99 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -305,8 +305,8 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -441,8 +441,8 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -475,8 +475,9 @@ static int ocfs2_get_clusters_nocache(struct inode *inode, BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0)", inode->i_ino, + ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0)\n", + inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; @@ -564,8 +565,8 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, if (el->l_tree_depth) { ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "xattr leaf block %llu\n", inode->i_ino, + "Inode %lu has non zero tree depth in xattr leaf block %llu\n", + inode->i_ino, (unsigned long long)eb_bh->b_blocknr); ret = -EROFS; goto out; @@ -582,8 +583,9 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { - ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in xattr", inode->i_ino, + ocfs2_error(inode->i_sb, + "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index fe4b3f7db245..8f87e05ee25d 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1362,31 +1362,31 @@ int ocfs2_validate_inode_block(struct super_block *sb, if (!OCFS2_IS_VALID_DINODE(di)) { rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", - (unsigned long long)bh->b_blocknr, 7, - di->i_signature); + (unsigned long long)bh->b_blocknr, 7, + di->i_signature); goto bail; } if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(di->i_blkno)); + (unsigned long long)bh->b_blocknr, + (unsigned long 
long)le64_to_cpu(di->i_blkno)); goto bail; } if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { rc = ocfs2_error(sb, - "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", - (unsigned long long)bh->b_blocknr); + "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", + (unsigned long long)bh->b_blocknr); goto bail; } if (le32_to_cpu(di->i_fs_generation) != OCFS2_SB(sb)->fs_generation) { rc = ocfs2_error(sb, - "Invalid dinode #%llu: fs_generation is %u\n", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(di->i_fs_generation)); + "Invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); goto bail; } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 52948af646b6..ff82b28462a6 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -374,7 +374,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) mlog_errno(PTR_ERR(handle)); if (is_journal_aborted(journal)) { - ocfs2_abort(osb->sb, "Detected aborted journal"); + ocfs2_abort(osb->sb, "Detected aborted journal\n"); handle = ERR_PTR(-EROFS); } } else { diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 857bbbcd39f3..0a4457fb0711 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -665,8 +665,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, #ifdef CONFIG_OCFS2_DEBUG_FS if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { - ocfs2_error(osb->sb, "local alloc inode %llu says it has " - "%u used bits, but a count shows %u", + ocfs2_error(osb->sb, "local alloc inode %llu says it has %u used bits, but a count shows %u\n", (unsigned long long)le64_to_cpu(alloc->i_blkno), le32_to_cpu(alloc->id1.bitmap1.i_used), ocfs2_local_alloc_count_bits(alloc)); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 70dd0ec7b7e9..124471d26a73 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -100,9 +100,8 @@ static int __ocfs2_move_extent(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ret = ocfs2_error(inode->i_sb, - "Inode %llu has an extent at cpos %u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); + "Inode %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ino, cpos); goto out; } diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index bb07004df72a..8a54fd8a4fa5 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -138,8 +138,7 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { ocfs2_error(inode->i_sb, - "Quota file %llu is probably corrupted! Requested " - "to read block %Lu but file has size only %Lu\n", + "Quota file %llu is probably corrupted! 
Requested to read block %Lu but file has size only %Lu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)v_block, (unsigned long long)i_size_read(inode)); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index b404dbde3fe4..e5d57cd32505 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -103,27 +103,25 @@ static int ocfs2_validate_refcount_block(struct super_block *sb, if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { rc = ocfs2_error(sb, - "Refcount block #%llu has bad signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - rb->rf_signature); + "Refcount block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + rb->rf_signature); goto out; } if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { rc = ocfs2_error(sb, - "Refcount block #%llu has an invalid rf_blkno " - "of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(rb->rf_blkno)); + "Refcount block #%llu has an invalid rf_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(rb->rf_blkno)); goto out; } if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { rc = ocfs2_error(sb, - "Refcount block #%llu has an invalid " - "rf_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(rb->rf_fs_generation)); + "Refcount block #%llu has an invalid rf_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(rb->rf_fs_generation)); goto out; } out: @@ -1103,10 +1101,9 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, if (el->l_tree_depth) { ret = ocfs2_error(sb, - "refcount tree %llu has non zero tree " - "depth in leaf btree tree block %llu\n", - (unsigned long long)ocfs2_metadata_cache_owner(ci), - (unsigned long long)eb_bh->b_blocknr); + "refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -2358,9 +2355,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode, cpos, len, phys); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); goto out; } @@ -2543,9 +2539,8 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); + ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); goto out; } @@ -2670,9 +2665,9 @@ static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, if (el->l_tree_depth) { ret = ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "leaf block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); + "Inode %lu has non zero tree depth in leaf block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -3103,9 +3098,8 @@ static int ocfs2_clear_ext_refcount(handle_t *handle, index = ocfs2_search_extent_list(el, cpos); if (index == -1) { ret = ocfs2_error(sb, - "Inode %llu has an extent at cpos 
%u which can no " - "longer be found.\n", - (unsigned long long)ino, cpos); + "Inode %llu has an extent at cpos %u which can no longer be found\n", + (unsigned long long)ino, cpos); goto out; } @@ -3371,9 +3365,8 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context) struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { - return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " - "tree, but the feature bit is not set in the " - "super block.", inode->i_ino); + return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", + inode->i_ino); } ocfs2_init_dealloc_ctxt(&context->dealloc); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index e4bb00110e91..0456ae399bf7 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -167,12 +167,12 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) } #define do_error(fmt, ...) \ - do{ \ - if (resize) \ - mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ - else \ - return ocfs2_error(sb, fmt, ##__VA_ARGS__); \ - } while (0) +do { \ + if (resize) \ + mlog(ML_ERROR, fmt, ##__VA_ARGS__); \ + else \ + return ocfs2_error(sb, fmt, ##__VA_ARGS__); \ +} while (0) static int ocfs2_validate_gd_self(struct super_block *sb, struct buffer_head *bh, @@ -181,36 +181,32 @@ static int ocfs2_validate_gd_self(struct super_block *sb, struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { - do_error("Group descriptor #%llu has bad signature %.*s", + do_error("Group descriptor #%llu has bad signature %.*s\n", (unsigned long long)bh->b_blocknr, 7, gd->bg_signature); } if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { - do_error("Group descriptor #%llu has an invalid bg_blkno " - "of %llu", + do_error("Group descriptor #%llu has an invalid bg_blkno of %llu\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_blkno)); } if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { - do_error("Group descriptor #%llu has an invalid " - "fs_generation of #%u", + do_error("Group descriptor #%llu has an invalid fs_generation of #%u\n", (unsigned long long)bh->b_blocknr, le32_to_cpu(gd->bg_generation)); } if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { - do_error("Group descriptor #%llu has bit count %u but " - "claims that %u are free", + do_error("Group descriptor #%llu has bit count %u but claims that %u are free\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), le16_to_cpu(gd->bg_free_bits_count)); } if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { - do_error("Group descriptor #%llu has bit count %u but " - "max bitmap bits of %u", + do_error("Group descriptor #%llu has bit count %u but max bitmap bits of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits), 8 * le16_to_cpu(gd->bg_size)); @@ -228,8 +224,7 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (di->i_blkno != gd->bg_parent_dinode) { - do_error("Group descriptor #%llu has bad parent " - "pointer (%llu, expected %llu)", + do_error("Group descriptor #%llu has bad parent pointer (%llu, expected %llu)\n", (unsigned long long)bh->b_blocknr, (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), (unsigned long long)le64_to_cpu(di->i_blkno)); @@ -237,7 +232,7 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, max_bits = 
le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); if (le16_to_cpu(gd->bg_bits) > max_bits) { - do_error("Group descriptor #%llu has bit count of %u", + do_error("Group descriptor #%llu has bit count of %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_bits)); } @@ -247,7 +242,7 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || ((le16_to_cpu(gd->bg_chain) == le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { - do_error("Group descriptor #%llu has bad chain %u", + do_error("Group descriptor #%llu has bad chain %u\n", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_chain)); } @@ -376,10 +371,10 @@ static int ocfs2_block_group_fill(handle_t *handle, struct super_block * sb = alloc_inode->i_sb; if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { - status = ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " - "b_blocknr (%llu)", - (unsigned long long)group_blkno, - (unsigned long long) bg_bh->b_blocknr); + status = ocfs2_error(alloc_inode->i_sb, + "group block (%llu) != b_blocknr (%llu)\n", + (unsigned long long)group_blkno, + (unsigned long long) bg_bh->b_blocknr); goto bail; } @@ -825,8 +820,9 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { - status = ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", - (unsigned long long)le64_to_cpu(fe->i_blkno)); + status = ocfs2_error(alloc_inode->i_sb, + "Invalid chain allocator %llu\n", + (unsigned long long)le64_to_cpu(fe->i_blkno)); goto bail; } @@ -1360,11 +1356,11 @@ int ocfs2_block_group_set_bits(handle_t *handle, le16_add_cpu(&bg->bg_free_bits_count, -num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), + num_bits); } while(num_bits--) ocfs2_set_bit(bit_off++, bitmap); @@ -1895,11 +1891,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, if (le32_to_cpu(fe->id1.bitmap1.i_used) >= le32_to_cpu(fe->id1.bitmap1.i_total)) { status = ocfs2_error(ac->ac_inode->i_sb, - "Chain allocator dinode %llu has %u used " - "bits but only %u total.", - (unsigned long long)le64_to_cpu(fe->i_blkno), - le32_to_cpu(fe->id1.bitmap1.i_used), - le32_to_cpu(fe->id1.bitmap1.i_total)); + "Chain allocator dinode %llu has %u used bits but only %u total\n", + (unsigned long long)le64_to_cpu(fe->i_blkno), + le32_to_cpu(fe->id1.bitmap1.i_used), + le32_to_cpu(fe->id1.bitmap1.i_total)); goto bail; } @@ -2417,11 +2412,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { - return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" - " count %u but claims %u are freed. 
num_bits %d", - (unsigned long long)le64_to_cpu(bg->bg_blkno), - le16_to_cpu(bg->bg_bits), - le16_to_cpu(bg->bg_free_bits_count), num_bits); + return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", + (unsigned long long)le64_to_cpu(bg->bg_blkno), + le16_to_cpu(bg->bg_bits), + le16_to_cpu(bg->bg_free_bits_count), + num_bits); } if (undo_fn) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index e79058ecfb4b..3a9a1af39ad7 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2600,7 +2600,7 @@ int __ocfs2_error(struct super_block *sb, const char *function, /* Not using mlog here because we want to show the actual * function the error came from. */ - printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n", + printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); @@ -2622,7 +2622,7 @@ void __ocfs2_abort(struct super_block *sb, const char *function, vaf.fmt = fmt; vaf.va = &args; - printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n", + printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV", sb->s_id, function, &vaf); va_end(args); diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index c1c87d90542c..b477d0b1c7b6 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -35,13 +35,15 @@ __printf(3, 4) int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...); -#define ocfs2_error(sb, fmt, args...) __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##args) +#define ocfs2_error(sb, fmt, ...) \ + __ocfs2_error(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) __printf(3, 4) void __ocfs2_abort(struct super_block *sb, const char *function, const char *fmt, ...); -#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args) +#define ocfs2_abort(sb, fmt, ...) 
\ + __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##__VA_ARGS__) /* * Void signal blockers, because in-kernel sigprocmask() only fails diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 5944a311bb94..ebfdea78659b 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -500,26 +500,23 @@ static int ocfs2_validate_xattr_block(struct super_block *sb, if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { return ocfs2_error(sb, - "Extended attribute block #%llu has bad " - "signature %.*s", - (unsigned long long)bh->b_blocknr, 7, - xb->xb_signature); + "Extended attribute block #%llu has bad signature %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + xb->xb_signature); } if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { return ocfs2_error(sb, - "Extended attribute block #%llu has an " - "invalid xb_blkno of %llu", - (unsigned long long)bh->b_blocknr, - (unsigned long long)le64_to_cpu(xb->xb_blkno)); + "Extended attribute block #%llu has an invalid xb_blkno of %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(xb->xb_blkno)); } if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { return ocfs2_error(sb, - "Extended attribute block #%llu has an invalid " - "xb_fs_generation of #%u", - (unsigned long long)bh->b_blocknr, - le32_to_cpu(xb->xb_fs_generation)); + "Extended attribute block #%llu has an invalid xb_fs_generation of #%u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(xb->xb_fs_generation)); } return 0; @@ -3692,9 +3689,9 @@ static int ocfs2_xattr_get_rec(struct inode *inode, if (el->l_tree_depth) { ret = ocfs2_error(inode->i_sb, - "Inode %lu has non zero tree depth in " - "xattr tree block %llu\n", inode->i_ino, - (unsigned long long)eb_bh->b_blocknr); + "Inode %lu has non zero tree depth in xattr tree block %llu\n", + inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); goto out; } } @@ -3709,10 +3706,10 @@ static int ocfs2_xattr_get_rec(struct inode *inode, } if (!e_blkno) { - ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0) in xattr", inode->i_ino, - le32_to_cpu(rec->e_cpos), - ocfs2_rec_clusters(el, rec)); + ret = ocfs2_error(inode->i_sb, "Inode %lu has bad extent record (%u, %u, 0) in xattr\n", + inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); goto out; } -- cgit v1.2.3 From 46359295a352e01a5a017297c70b7ee0c5da6de6 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Fri, 4 Sep 2015 15:44:54 -0700 Subject: ocfs2: clean up redundant NULL checks before kfree NULL checks before kfree() are redundant, so clean them up. 
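For context, kfree() is guaranteed to be a no-op when passed NULL, so the guard buys nothing; a minimal illustration (hypothetical pointer name, not from the patch):

	char *buf = NULL;

	if (buf)		/* redundant: kfree(NULL) is already safe */
		kfree(buf);

	kfree(buf);		/* equivalent, and one branch less to read */
	buf = NULL;		/* keep the reset if the field is reused later */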
Signed-off-by: Joseph Qi Reviewed-by: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/suballoc.c | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index b20706e8a4d1..86181d6526dc 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6178,7 +6178,7 @@ bail: iput(tl_inode); brelse(tl_bh); - if (status < 0 && (*tl_copy)) { + if (status < 0) { kfree(*tl_copy); *tl_copy = NULL; mlog_errno(status); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 0456ae399bf7..d83d2602cf2b 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -149,10 +149,8 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) brelse(ac->ac_bh); ac->ac_bh = NULL; ac->ac_resv = NULL; - if (ac->ac_find_loc_priv) { - kfree(ac->ac_find_loc_priv); - ac->ac_find_loc_priv = NULL; - } + kfree(ac->ac_find_loc_priv); + ac->ac_find_loc_priv = NULL; } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) -- cgit v1.2.3 From a068acf2ee77693e0bf39d6e07139ba704f461c3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 4 Sep 2015 15:44:57 -0700 Subject: fs: create and use seq_show_option for escaping Many file systems that implement the show_options hook fail to correctly escape their output which could lead to unescaped characters (e.g. new lines) leaking into /proc/mounts and /proc/[pid]/mountinfo files. This could lead to confusion, spoofed entries (resulting in things like systemd issuing false d-bus "mount" notifications), and who knows what else. This looks like it would only be the root user stepping on themselves, but it's possible weird things could happen in containers or in other situations with delegated mount privileges. Here's an example using overlay with setuid fusermount trusting the contents of /proc/mounts (via the /etc/mtab symlink). Imagine the use of "sudo" is something more sneaky: $ BASE="ovl" $ MNT="$BASE/mnt" $ LOW="$BASE/lower" $ UP="$BASE/upper" $ WORK="$BASE/work/ 0 0 none /proc fuse.pwn user_id=1000" $ mkdir -p "$LOW" "$UP" "$WORK" $ sudo mount -t overlay -o "lowerdir=$LOW,upperdir=$UP,workdir=$WORK" none /mnt $ cat /proc/mounts none /root/ovl/mnt overlay rw,relatime,lowerdir=ovl/lower,upperdir=ovl/upper,workdir=ovl/work/ 0 0 none /proc fuse.pwn user_id=1000 0 0 $ fusermount -u /proc $ cat /proc/mounts cat: /proc/mounts: No such file or directory This fixes the problem by adding new seq_show_option and seq_show_option_n helpers, and updating the vulnerable show_option handlers to use them as needed. Some, like SELinux, need to be open coded due to unusual existing escape mechanisms. [akpm@linux-foundation.org: add lost chunk, per Kees] [keescook@chromium.org: seq_show_option should be using const parameters] Signed-off-by: Kees Cook Acked-by: Serge Hallyn Acked-by: Jan Kara Acked-by: Paul Moore Cc: J. R. 
Okajima Signed-off-by: Kees Cook Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ceph/super.c | 2 +- fs/cifs/cifsfs.c | 6 +++--- fs/ext4/super.c | 4 ++-- fs/gfs2/super.c | 6 +++--- fs/hfs/super.c | 4 ++-- fs/hfsplus/options.c | 4 ++-- fs/hostfs/hostfs_kern.c | 2 +- fs/ocfs2/super.c | 4 ++-- fs/overlayfs/super.c | 6 +++--- fs/reiserfs/super.c | 8 +++++--- fs/xfs/xfs_super.c | 4 ++-- include/linux/seq_file.h | 35 +++++++++++++++++++++++++++++++++++ kernel/cgroup.c | 7 ++++--- net/ceph/ceph_common.c | 7 +++++-- security/selinux/hooks.c | 2 +- 15 files changed, 71 insertions(+), 30 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index d1c833c321b9..7b6bfcbf801c 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -479,7 +479,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) - seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + seq_show_option(m, "snapdirname", fsopt->snapdir_name); return 0; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 0a9fb6b53126..6a1119e87fbb 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -394,17 +394,17 @@ cifs_show_options(struct seq_file *s, struct dentry *root) struct sockaddr *srcaddr; srcaddr = (struct sockaddr *)&tcon->ses->server->srcaddr; - seq_printf(s, ",vers=%s", tcon->ses->server->vals->version_string); + seq_show_option(s, "vers", tcon->ses->server->vals->version_string); cifs_show_security(s, tcon->ses); cifs_show_cache_flavor(s, cifs_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) seq_puts(s, ",multiuser"); else if (tcon->ses->user_name) - seq_printf(s, ",username=%s", tcon->ses->user_name); + seq_show_option(s, "username", tcon->ses->user_name); if (tcon->ses->domainName) - seq_printf(s, ",domain=%s", tcon->ses->domainName); + seq_show_option(s, "domain", tcon->ses->domainName); if (srcaddr->sa_family != AF_UNSPEC) { struct sockaddr_in *saddr4; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ee3878262a49..a63c7b0a10cf 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1776,10 +1776,10 @@ static inline void ext4_show_quota_options(struct seq_file *seq, } if (sbi->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); if (sbi->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); #endif } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 2982445947e1..894fb01a91da 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1334,11 +1334,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) if (is_ancestor(root, sdp->sd_master_dir)) seq_puts(s, ",meta"); if (args->ar_lockproto[0]) - seq_printf(s, ",lockproto=%s", args->ar_lockproto); + seq_show_option(s, "lockproto", args->ar_lockproto); if (args->ar_locktable[0]) - seq_printf(s, ",locktable=%s", args->ar_locktable); + seq_show_option(s, "locktable", args->ar_locktable); if (args->ar_hostdata[0]) - seq_printf(s, ",hostdata=%s", args->ar_hostdata); + seq_show_option(s, "hostdata", args->ar_hostdata); if (args->ar_spectator) seq_puts(s, ",spectator"); if (args->ar_localflocks) diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 55c03b9e9070..4574fdd3d421 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -136,9 +136,9 
@@ static int hfs_show_options(struct seq_file *seq, struct dentry *root) struct hfs_sb_info *sbi = HFS_SB(root->d_sb); if (sbi->s_creator != cpu_to_be32(0x3f3f3f3f)) - seq_printf(seq, ",creator=%.4s", (char *)&sbi->s_creator); + seq_show_option_n(seq, "creator", (char *)&sbi->s_creator, 4); if (sbi->s_type != cpu_to_be32(0x3f3f3f3f)) - seq_printf(seq, ",type=%.4s", (char *)&sbi->s_type); + seq_show_option_n(seq, "type", (char *)&sbi->s_type, 4); seq_printf(seq, ",uid=%u,gid=%u", from_kuid_munged(&init_user_ns, sbi->s_uid), from_kgid_munged(&init_user_ns, sbi->s_gid)); diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index c90b72ee676d..bb806e58c977 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -218,9 +218,9 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root) struct hfsplus_sb_info *sbi = HFSPLUS_SB(root->d_sb); if (sbi->creator != HFSPLUS_DEF_CR_TYPE) - seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); + seq_show_option_n(seq, "creator", (char *)&sbi->creator, 4); if (sbi->type != HFSPLUS_DEF_CR_TYPE) - seq_printf(seq, ",type=%.4s", (char *)&sbi->type); + seq_show_option_n(seq, "type", (char *)&sbi->type, 4); seq_printf(seq, ",umask=%o,uid=%u,gid=%u", sbi->umask, from_kuid_munged(&init_user_ns, sbi->uid), from_kgid_munged(&init_user_ns, sbi->gid)); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 059597b23f67..2ac99db3750e 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -260,7 +260,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) size_t offset = strlen(root_ino) + 1; if (strlen(root_path) > offset) - seq_printf(seq, ",%s", root_path + offset); + seq_show_option(seq, root_path + offset, NULL); if (append) seq_puts(seq, ",append"); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 3a9a1af39ad7..2de4c8a9340c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1563,8 +1563,8 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",localflocks,"); if (osb->osb_cluster_stack[0]) - seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, - osb->osb_cluster_stack); + seq_show_option_n(s, "cluster_stack", osb->osb_cluster_stack, + OCFS2_STACK_LABEL_LEN); if (opts & OCFS2_MOUNT_USRQUOTA) seq_printf(s, ",usrquota"); if (opts & OCFS2_MOUNT_GRPQUOTA) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7466ff339c66..79073d68b475 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -588,10 +588,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) struct super_block *sb = dentry->d_sb; struct ovl_fs *ufs = sb->s_fs_info; - seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); + seq_show_option(m, "lowerdir", ufs->config.lowerdir); if (ufs->config.upperdir) { - seq_printf(m, ",upperdir=%s", ufs->config.upperdir); - seq_printf(m, ",workdir=%s", ufs->config.workdir); + seq_show_option(m, "upperdir", ufs->config.upperdir); + seq_show_option(m, "workdir", ufs->config.workdir); } return 0; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0e4cf728126f..4a62fe8cc3bf 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -714,18 +714,20 @@ static int reiserfs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",acl"); if (REISERFS_SB(s)->s_jdev) - seq_printf(seq, ",jdev=%s", REISERFS_SB(s)->s_jdev); + seq_show_option(seq, "jdev", REISERFS_SB(s)->s_jdev); if (journal->j_max_commit_age != journal->j_default_max_commit_age) seq_printf(seq, ",commit=%d", 
journal->j_max_commit_age); #ifdef CONFIG_QUOTA if (REISERFS_SB(s)->s_qf_names[USRQUOTA]) - seq_printf(seq, ",usrjquota=%s", REISERFS_SB(s)->s_qf_names[USRQUOTA]); + seq_show_option(seq, "usrjquota", + REISERFS_SB(s)->s_qf_names[USRQUOTA]); else if (opts & (1 << REISERFS_USRQUOTA)) seq_puts(seq, ",usrquota"); if (REISERFS_SB(s)->s_qf_names[GRPQUOTA]) - seq_printf(seq, ",grpjquota=%s", REISERFS_SB(s)->s_qf_names[GRPQUOTA]); + seq_show_option(seq, "grpjquota", + REISERFS_SB(s)->s_qf_names[GRPQUOTA]); else if (opts & (1 << REISERFS_GRPQUOTA)) seq_puts(seq, ",grpquota"); if (REISERFS_SB(s)->s_jquota_fmt) { diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1fb16562c159..bbd9b1f10ffb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -511,9 +511,9 @@ xfs_showargs( seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); if (mp->m_logname) - seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); + seq_show_option(m, MNTOPT_LOGDEV, mp->m_logname); if (mp->m_rtname) - seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); + seq_show_option(m, MNTOPT_RTDEV, mp->m_rtname); if (mp->m_dalign > 0) seq_printf(m, "," MNTOPT_SUNIT "=%d", diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 912a7c482649..d4c7271382cb 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -149,6 +149,41 @@ static inline struct user_namespace *seq_user_ns(struct seq_file *seq) #endif } +/** + * seq_show_options - display mount options with appropriate escapes. + * @m: the seq_file handle + * @name: the mount option name + * @value: the mount option name's value, can be NULL + */ +static inline void seq_show_option(struct seq_file *m, const char *name, + const char *value) +{ + seq_putc(m, ','); + seq_escape(m, name, ",= \t\n\\"); + if (value) { + seq_putc(m, '='); + seq_escape(m, value, ", \t\n\\"); + } +} + +/** + * seq_show_option_n - display mount options with appropriate escapes + * where @value must be a specific length. + * @m: the seq_file handle + * @name: the mount option name + * @value: the mount option name's value, cannot be NULL + * @length: the length of @value to display + * + * This is a macro since this uses "length" to define the size of the + * stack buffer. 
+ */ +#define seq_show_option_n(m, name, value, length) { \ + char val_buf[length + 1]; \ + strncpy(val_buf, value, length); \ + val_buf[length] = '\0'; \ + seq_show_option(m, name, val_buf); \ +} + #define SEQ_START_TOKEN ((void *)1) /* * Helpers for iteration over list_head-s in seq_files diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f3f5cd5e2c0d..a8538e443784 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1342,7 +1342,7 @@ static int cgroup_show_options(struct seq_file *seq, if (root != &cgrp_dfl_root) for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) - seq_printf(seq, ",%s", ss->legacy_name); + seq_show_option(seq, ss->name, NULL); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) @@ -1350,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq, spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) - seq_printf(seq, ",release_agent=%s", root->release_agent_path); + seq_show_option(seq, "release_agent", + root->release_agent_path); spin_unlock(&release_agent_path_lock); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) - seq_printf(seq, ",name=%s", root->name); + seq_show_option(seq, "name", root->name); return 0; } diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index f30329f72641..69a4d30a9ccf 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -517,8 +517,11 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) struct ceph_options *opt = client->options; size_t pos = m->count; - if (opt->name) - seq_printf(m, "name=%s,", opt->name); + if (opt->name) { + seq_puts(m, "name="); + seq_escape(m, opt->name, ", \t\n\\"); + seq_putc(m, ','); + } if (opt->key) seq_puts(m, "secret=,"); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 564079c5c49d..cdf4c589a391 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1100,7 +1100,7 @@ static void selinux_write_opts(struct seq_file *m, seq_puts(m, prefix); if (has_comma) seq_putc(m, '\"'); - seq_puts(m, opts->mnt_opts[i]); + seq_escape(m, opts->mnt_opts[i], "\"\n\\"); if (has_comma) seq_putc(m, '\"'); } -- cgit v1.2.3 From 5869b5064b0950afc447610c0f5d4134b71d9e94 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 4 Sep 2015 15:45:00 -0700 Subject: smpboot: fix memory leak on error handling The cpumask is allocated before threads get created. If the latter step fails, we need to free the cpumask. Signed-off-by: Frederic Weisbecker Reviewed-by: Chris Metcalf Reviewed-by: Thomas Gleixner Cc: Chris Metcalf Cc: Don Zickus Cc: Peter Zijlstra Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smpboot.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 7c434c39f02a..71aa90b69f8f 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -301,6 +301,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) ret = __smpboot_create_thread(plug_thread, cpu); if (ret) { smpboot_destroy_threads(plug_thread); + free_cpumask_var(plug_thread->cpumask); goto out; } smpboot_unpark_thread(plug_thread, cpu); -- cgit v1.2.3 From 3dd08c0c918f9bf058572ddbf26e7d6fb5674a5c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 4 Sep 2015 15:45:03 -0700 Subject: smpboot: make cleanup to mirror setup The per-cpu kthread cleanup() callback is the mirror of the setup() callback. 
When the per-cpu kthread is started, it first calls setup() to initialize the resources, which are then released by cleanup() when the kthread exits. Now since the introduction of a per-cpu kthread cpumask, the kthreads excluded by the cpumask on boot may happen to be parked immediately after their creation without taking the setup() stage, waiting to be asked to unpark to do so. Then when smpboot_unregister_percpu_thread() is later called, the kthread is stopped without having ever called setup(). But this triggers a bug as the kthread unconditionally calls cleanup() on exit but this doesn't mirror any setup(). Thus the kernel crashes because we try to free resources that haven't been initialized, as in the watchdog case: WATCHDOG disable 0 WATCHDOG disable 1 WATCHDOG disable 2 BUG: unable to handle kernel NULL pointer dereference at (null) IP: hrtimer_active+0x26/0x60 [...] Call Trace: hrtimer_try_to_cancel+0x1c/0x280 hrtimer_cancel+0x1d/0x30 watchdog_disable+0x56/0x70 watchdog_cleanup+0xe/0x10 smpboot_thread_fn+0x23c/0x2c0 kthread+0xf8/0x110 ret_from_fork+0x3f/0x70 This bug is currently masked with explicit kthread unparking before kthread_stop() in smpboot_destroy_threads(). This forces a call to setup() and then unpark(). We could fix this by unconditionally calling setup() on kthread entry. But setup() isn't always cheap. In the case of watchdog it launches hrtimer, perf events, etc... So we may as well skip it if there is a chance the kthread will never be used, as with a reduced cpumask value. So let's simply do a state machine check before calling cleanup() that makes sure setup() has been called before mirroring it. And remove the nasty hack workaround. Signed-off-by: Frederic Weisbecker Reviewed-by: Chris Metcalf Reviewed-by: Thomas Gleixner Cc: Chris Metcalf Cc: Don Zickus Cc: Peter Zijlstra Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smpboot.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 71aa90b69f8f..60aa858a6a07 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data) if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); preempt_enable(); - if (ht->cleanup) + /* cleanup must mirror setup */ + if (ht->cleanup && td->status != HP_THREAD_NONE) ht->cleanup(td->cpu, cpu_online(td->cpu)); kfree(td); return 0; @@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) { unsigned int cpu; - /* Unpark any threads that were voluntarily parked. */ for_each_cpu_not(cpu, ht->cpumask) { if (cpu_online(cpu)) { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); if (tsk) kthread_unpark(tsk); } } - /* We need to destroy also the parked threads of offline cpus */ for_each_possible_cpu(cpu) { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); -- cgit v1.2.3 From 230ec93909f00678401cb2d63b8b95f1dea68e40 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 4 Sep 2015 15:45:06 -0700 Subject: smpboot: allow passing the cpumask on per-cpu thread registration It makes the registration cheaper and simpler for the smpboot per-cpu kthread users that don't need to always update the cpumask after thread creation. 
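As a usage sketch (hypothetical caller; the real conversion is the watchdog hunk below), the mask can now be supplied at registration instead of being fixed up afterwards:

	/* Before: register on cpu_possible_mask, then shrink the mask. */
	err = smpboot_register_percpu_thread(&my_threads);
	if (!err)
		err = smpboot_update_cpumask_percpu_thread(&my_threads,
							   &my_mask);

	/* After: one call; threads for CPUs outside the mask are created
	 * but left parked until the mask is widened. */
	err = smpboot_register_percpu_thread_cpumask(&my_threads, &my_mask);

Note that smpboot_register_percpu_thread() survives as an inline wrapper passing cpu_possible_mask, so existing callers keep working unchanged.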
[sfr@canb.auug.org.au: fix for allow passing the cpumask on per-cpu thread registration] Signed-off-by: Frederic Weisbecker Reviewed-by: Chris Metcalf Reviewed-by: Thomas Gleixner Cc: Chris Metcalf Cc: Don Zickus Cc: Peter Zijlstra Cc: Ulrich Obergfell Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smpboot.h | 11 ++++++++++- kernel/smpboot.c | 14 +++++++++----- kernel/watchdog.c | 9 +++------ 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index da3c593f9845..e6109a6cd8f6 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h @@ -48,7 +48,16 @@ struct smp_hotplug_thread { const char *thread_comm; }; -int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread); +int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, + const struct cpumask *cpumask); + +static inline int +smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) +{ + return smpboot_register_percpu_thread_cpumask(plug_thread, + cpu_possible_mask); +} + void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, const struct cpumask *); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 60aa858a6a07..a818cbc73e14 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -273,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) } /** - * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug + * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related + * to hotplug * @plug_thread: Hotplug thread descriptor + * @cpumask: The cpumask where threads run * * Creates and starts the threads on all online cpus. 
*/ -int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) +int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, + const struct cpumask *cpumask) { unsigned int cpu; int ret = 0; if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) return -ENOMEM; - cpumask_copy(plug_thread->cpumask, cpu_possible_mask); + cpumask_copy(plug_thread->cpumask, cpumask); get_online_cpus(); mutex_lock(&smpboot_threads_lock); @@ -296,7 +299,8 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) free_cpumask_var(plug_thread->cpumask); goto out; } - smpboot_unpark_thread(plug_thread, cpu); + if (cpumask_test_cpu(cpu, cpumask)) + smpboot_unpark_thread(plug_thread, cpu); } list_add(&plug_thread->list, &hotplug_threads); out: @@ -304,7 +308,7 @@ out: put_online_cpus(); return ret; } -EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); +EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask); /** * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a6ffa43f2993..e5bb86fb0ea5 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -713,15 +713,12 @@ static int watchdog_enable_all_cpus(void) int err = 0; if (!watchdog_running) { - err = smpboot_register_percpu_thread(&watchdog_threads); + err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, + &watchdog_cpumask); if (err) pr_err("Failed to create watchdog threads, disabled\n"); - else { - if (smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask)) - pr_err("Failed to set cpumask for watchdog threads\n"); + else watchdog_running = 1; - } } else { /* * Enable/disable the lockup detectors or -- cgit v1.2.3 From 314b08ff5205420d956d14657e16d92c460a6f21 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 4 Sep 2015 15:45:09 -0700 Subject: watchdog: simplify housekeeping affinity with the appropriate mask housekeeping_mask gathers all the CPUs that aren't part of the nohz_full set. This is exactly what we want the watchdog to be affine to without the need to use complicated cpumask operations. Signed-off-by: Frederic Weisbecker Reviewed-by: Chris Metcalf Cc: Thomas Gleixner Cc: Chris Metcalf Cc: Don Zickus Cc: Peter Zijlstra Cc: Ulrich Obergfell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e5bb86fb0ea5..d18330fa4776 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -929,10 +929,8 @@ void __init lockup_detector_init(void) #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_enabled()) { - if (!cpumask_empty(tick_nohz_full_mask)) - pr_info("Disabling watchdog on nohz_full cores by default\n"); - cpumask_andnot(&watchdog_cpumask, cpu_possible_mask, - tick_nohz_full_mask); + pr_info("Disabling watchdog on nohz_full cores by default\n"); + cpumask_copy(&watchdog_cpumask, housekeeping_mask); } else cpumask_copy(&watchdog_cpumask, cpu_possible_mask); #else -- cgit v1.2.3 From aacfbe6a9724bb6d66a656a5abcc681d5649ed92 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Fri, 4 Sep 2015 15:45:12 -0700 Subject: kernel/watchdog: move NMI function header declarations from watchdog.h to nmi.h The kernel's NMI watchdog has nothing to do with the watchdog subsystem. Its header declarations should be in linux/nmi.h, not linux/watchdog.h. 
The code provided two sets of dummy functions if HARDLOCKUP_DETECTOR is not configured, one in the include file and one in kernel/watchdog.c. Remove the dummy functions from kernel/watchdog.c and use those from the include file. Signed-off-by: Guenter Roeck Cc: Stephane Eranian Cc: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- include/linux/nmi.h | 8 +++++--- include/linux/watchdog.h | 8 -------- kernel/watchdog.c | 2 -- 4 files changed, 6 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3f124d553c5a..36bd8250934b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/nmi.h b/include/linux/nmi.h index f94da0e65dea..088714537d10 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -26,10 +26,12 @@ static inline void touch_nmi_watchdog(void) #if defined(CONFIG_HARDLOCKUP_DETECTOR) extern void hardlockup_detector_disable(void); +void watchdog_nmi_disable_all(void); +void watchdog_nmi_enable_all(void); #else -static inline void hardlockup_detector_disable(void) -{ -} +static inline void hardlockup_detector_disable(void) {} +static inline void watchdog_nmi_disable_all(void) {} +static inline void watchdog_nmi_enable_all(void) {} #endif /* diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index f47feada5b42..d74a0e907b9e 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -140,12 +140,4 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd, extern int watchdog_register_device(struct watchdog_device *); extern void watchdog_unregister_device(struct watchdog_device *); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -void watchdog_nmi_disable_all(void); -void watchdog_nmi_enable_all(void); -#else -static inline void watchdog_nmi_disable_all(void) {} -static inline void watchdog_nmi_enable_all(void) {} -#endif - #endif /* ifndef _LINUX_WATCHDOG_H */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d18330fa4776..e74d48bc3e61 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -651,8 +651,6 @@ unlock: #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } -void watchdog_nmi_enable_all(void) {} -void watchdog_nmi_disable_all(void) {} #endif /* CONFIG_HARDLOCKUP_DETECTOR */ static struct smp_hotplug_thread watchdog_threads = { -- cgit v1.2.3 From 81a4beef91ba4a9e8ad6054ca9933dff7e25ff28 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Fri, 4 Sep 2015 15:45:15 -0700 Subject: watchdog: introduce watchdog_park_threads() and watchdog_unpark_threads() Originally watchdog_nmi_enable(cpu) and watchdog_nmi_disable(cpu) were only called in watchdog thread context. However, the following commits utilize these functions outside of watchdog thread context too. commit 9809b18fcf6b8d8ec4d3643677345907e6b50eca Author: Michal Hocko Date: Tue Sep 24 15:27:30 2013 -0700 watchdog: update watchdog_thresh properly commit b3738d29323344da3017a91010530cf3a58590fc Author: Stephane Eranian Date: Mon Nov 17 20:07:03 2014 +0100 watchdog: Add watchdog enable/disable all functions Hence, it is now possible that these functions execute concurrently with the same 'cpu' argument. 
This concurrency is problematic because per-cpu 'watchdog_ev' can be accessed/modified without adequate synchronization. The patch series aims to address the above problem. However, instead of introducing locks to protect per-cpu 'watchdog_ev' a different approach is taken: Invoke these functions by parking and unparking the watchdog threads (to ensure they are always called in watchdog thread context). static struct smp_hotplug_thread watchdog_threads = { ... .park = watchdog_disable, // calls watchdog_nmi_disable() .unpark = watchdog_enable, // calls watchdog_nmi_enable() }; Both previously mentioned commits call these functions in a similar way and thus in principle contain some duplicate code. The patch series also avoids this duplication by providing a commonly usable mechanism. - Patch 1/4 introduces the watchdog_{park|unpark}_threads functions that park/unpark all watchdog threads specified in 'watchdog_cpumask'. They are intended to be called inside of kernel/watchdog.c only. - Patch 2/4 introduces the watchdog_{suspend|resume} functions which can be utilized by external callers to deactivate the hard and soft lockup detector temporarily. - Patch 3/4 utilizes watchdog_{park|unpark}_threads to replace some code that was introduced by commit 9809b18fcf6b8d8ec4d3643677345907e6b50eca. - Patch 4/4 utilizes watchdog_{suspend|resume} to replace some code that was introduced by commit b3738d29323344da3017a91010530cf3a58590fc. A few corner cases should be mentioned here for completeness. - kthread_park() of watchdog/N could hang if cpu N is already locked up. However, if watchdog is enabled the lockup will be detected anyway. - kthread_unpark() of watchdog/N could hang if cpu N got locked up after kthread_park(). The occurrence of this scenario should be _very_ rare in practice, in particular because it is not expected that temporary deactivation will happen frequently, and if it happens at all it is expected that the duration of deactivation will be short. This patch (of 4): introduce watchdog_park_threads() and watchdog_unpark_threads() These functions are intended to be used only from inside kernel/watchdog.c to park/unpark all watchdog threads that are specified in watchdog_cpumask. 
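For illustration, the round trip these helpers enable looks as follows; patch 3/4 below applies exactly this pattern in update_watchdog_all_cpus(), relying on .park/.unpark being wired to watchdog_disable()/watchdog_enable() as shown in the struct above (sketch only):

	/* sketch: propagate new parameters (e.g. sample_period) to all threads */
	static void update_watchdog_all_cpus(void)
	{
		watchdog_park_threads();	/* each thread runs watchdog_disable() */
		watchdog_unpark_threads();	/* each thread runs watchdog_enable() */
	}
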
Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Cc: Guenter Roeck Cc: Don Zickus Cc: Ulrich Obergfell Cc: Jiri Olsa Cc: Michal Hocko Cc: Stephane Eranian Cc: Chris Metcalf Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e74d48bc3e61..6c489e49c610 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -24,6 +24,7 @@ #include #include #include +#include /* * The run state of the lockup detectors is controlled by the content of the @@ -664,6 +665,41 @@ static struct smp_hotplug_thread watchdog_threads = { .unpark = watchdog_enable, }; +/* + * park all watchdog threads that are specified in 'watchdog_cpumask' + */ +static int watchdog_park_threads(void) +{ + int cpu, ret = 0; + + get_online_cpus(); + for_each_watchdog_cpu(cpu) { + ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); + if (ret) + break; + } + if (ret) { + for_each_watchdog_cpu(cpu) + kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + } + put_online_cpus(); + + return ret; +} + +/* + * unpark all watchdog threads that are specified in 'watchdog_cpumask' + */ +static void watchdog_unpark_threads(void) +{ + int cpu; + + get_online_cpus(); + for_each_watchdog_cpu(cpu) + kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + put_online_cpus(); +} + static void restart_watchdog_hrtimer(void *info) { struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); -- cgit v1.2.3 From 8c073d27d7ad293bf734cc8475689413afadab81 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Fri, 4 Sep 2015 15:45:18 -0700 Subject: watchdog: introduce watchdog_suspend() and watchdog_resume() This interface can be utilized to deactivate the hard and soft lockup detector temporarily. Callers are expected to minimize the duration of deactivation. Multiple deactivations are allowed to occur in parallel but should be rare in practice. 
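A minimal caller sketch follows; patch 4/4 below uses this exact pattern in fixup_ht_bug(). The suspend can fail if a watchdog thread cannot be parked, so callers must check the return value:

	if (watchdog_suspend() != 0)
		return -EBUSY;	/* illustrative error code; parking failed */
	/* ... work that must not race with the lockup detectors ... */
	watchdog_resume();
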
[akpm@linux-foundation.org: remove unneeded static initialization] Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Cc: Guenter Roeck Cc: Don Zickus Cc: Ulrich Obergfell Cc: Jiri Olsa Cc: Michal Hocko Cc: Stephane Eranian Cc: Chris Metcalf Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nmi.h | 2 ++ kernel/watchdog.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 088714537d10..e9f213c337bb 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -82,6 +82,8 @@ extern int proc_watchdog_thresh(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int proc_watchdog_cpumask(struct ctl_table *, int, void __user *, size_t *, loff_t *); +extern int watchdog_suspend(void); +extern void watchdog_resume(void); #endif #ifdef CONFIG_HAVE_ACPI_APEI_NMI diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6c489e49c610..e6eb5b697212 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -67,6 +67,7 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #define for_each_watchdog_cpu(cpu) \ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) +static int __read_mostly watchdog_suspended; static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; @@ -700,6 +701,50 @@ static void watchdog_unpark_threads(void) put_online_cpus(); } +/* + * Suspend the hard and soft lockup detector by parking the watchdog threads. + */ +int watchdog_suspend(void) +{ + int ret = 0; + + mutex_lock(&watchdog_proc_mutex); + /* + * Multiple suspend requests can be active in parallel (counted by + * the 'watchdog_suspended' variable). If the watchdog threads are + * running, the first caller takes care that they will be parked. + * The state of 'watchdog_running' cannot change while a suspend + * request is active (see related changes in 'proc' handlers). + */ + if (watchdog_running && !watchdog_suspended) + ret = watchdog_park_threads(); + + if (ret == 0) + watchdog_suspended++; + + mutex_unlock(&watchdog_proc_mutex); + + return ret; +} + +/* + * Resume the hard and soft lockup detector by unparking the watchdog threads. + */ +void watchdog_resume(void) +{ + mutex_lock(&watchdog_proc_mutex); + + watchdog_suspended--; + /* + * The watchdog threads are unparked if they were previously running + * and if there is no more active suspend request. 
+ */ + if (watchdog_running && !watchdog_suspended) + watchdog_unpark_threads(); + + mutex_unlock(&watchdog_proc_mutex); +} + static void restart_watchdog_hrtimer(void *info) { struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); @@ -818,6 +863,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, mutex_lock(&watchdog_proc_mutex); + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + /* * If the parameter is being read return the state of the corresponding * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the @@ -903,6 +954,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, mutex_lock(&watchdog_proc_mutex); + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + old = ACCESS_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -934,6 +991,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, int err; mutex_lock(&watchdog_proc_mutex); + + if (watchdog_suspended) { + /* no parameter changes allowed while watchdog is suspended */ + err = -EAGAIN; + goto out; + } + err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); if (!err && write) { /* Remove impossible cpus to keep sysctl output cleaner. */ @@ -951,6 +1015,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, pr_err("cpumask update failed\n"); } } +out: mutex_unlock(&watchdog_proc_mutex); return err; } -- cgit v1.2.3 From d4bdd0b21c7652a8271f873cc755486b255c1bbd Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Fri, 4 Sep 2015 15:45:21 -0700 Subject: watchdog: use park/unpark functions in update_watchdog_all_cpus() Remove update_watchdog() and restart_watchdog_hrtimer() since these functions are no longer needed. Changes of parameters such as the sample period are honored at the time when the watchdog threads are being unparked. Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Cc: Guenter Roeck Cc: Don Zickus Cc: Ulrich Obergfell Cc: Jiri Olsa Cc: Michal Hocko Cc: Stephane Eranian Cc: Chris Metcalf Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 40 ++-------------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e6eb5b697212..eb8f94b50101 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -745,46 +745,10 @@ void watchdog_resume(void) mutex_unlock(&watchdog_proc_mutex); } -static void restart_watchdog_hrtimer(void *info) -{ - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); - int ret; - - /* - * No need to cancel and restart hrtimer if it is currently executing - * because it will reprogram itself with the new period now. - * We should never see it unqueued here because we are running per-cpu - * with interrupts disabled. - */ - ret = hrtimer_try_to_cancel(hrtimer); - if (ret == 1) - hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); -} - -static void update_watchdog(int cpu) -{ - /* - * Make sure that perf event counter will adopt to a new - * sampling period. Updating the sampling period directly would - * be much nicer but we do not have an API for that now so - * let's use a big hammer. - * Hrtimer will adopt the new period on the next tick but this - * might be late already so we have to restart the timer as well. 
- */ - watchdog_nmi_disable(cpu); - smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); - watchdog_nmi_enable(cpu); -} - static void update_watchdog_all_cpus(void) { - int cpu; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - update_watchdog(cpu); - put_online_cpus(); + watchdog_park_threads(); + watchdog_unpark_threads(); } static int watchdog_enable_all_cpus(void) -- cgit v1.2.3 From 999bbe49ea0118b70ddf3f5d679f51dc7a97ae55 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Fri, 4 Sep 2015 15:45:25 -0700 Subject: watchdog: use suspend/resume interface in fixup_ht_bug() Remove watchdog_nmi_disable_all() and watchdog_nmi_enable_all() since these functions are no longer needed. If a subsystem has a need to deactivate the watchdog temporarily, it should utilize the watchdog_suspend() and watchdog_resume() functions. [akpm@linux-foundation.org: fix build with CONFIG_LOCKUP_DETECTOR=m] Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Cc: Guenter Roeck Cc: Don Zickus Cc: Ulrich Obergfell Cc: Jiri Olsa Cc: Michal Hocko Cc: Stephane Eranian Cc: Chris Metcalf Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event_intel.c | 7 +++++-- include/linux/nmi.h | 13 +++++++++---- kernel/watchdog.c | 35 ---------------------------------- 3 files changed, 14 insertions(+), 41 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 36bd8250934b..144ab91951a7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -3627,7 +3627,10 @@ static __init int fixup_ht_bug(void) return 0; } - watchdog_nmi_disable_all(); + if (watchdog_suspend() != 0) { + pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); + return 0; + } x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); @@ -3635,7 +3638,7 @@ static __init int fixup_ht_bug(void) x86_pmu.commit_scheduling = NULL; x86_pmu.stop_scheduling = NULL; - watchdog_nmi_enable_all(); + watchdog_resume(); get_online_cpus(); diff --git a/include/linux/nmi.h b/include/linux/nmi.h index e9f213c337bb..e5afe8bae202 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -26,12 +26,8 @@ static inline void touch_nmi_watchdog(void) #if defined(CONFIG_HARDLOCKUP_DETECTOR) extern void hardlockup_detector_disable(void); -void watchdog_nmi_disable_all(void); -void watchdog_nmi_enable_all(void); #else static inline void hardlockup_detector_disable(void) {} -static inline void watchdog_nmi_disable_all(void) {} -static inline void watchdog_nmi_enable_all(void) {} #endif /* @@ -84,6 +80,15 @@ extern int proc_watchdog_cpumask(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int watchdog_suspend(void); extern void watchdog_resume(void); +#else +static inline int watchdog_suspend(void) +{ + return 0; +} + +static inline void watchdog_resume(void) +{ +} #endif #ifdef CONFIG_HAVE_ACPI_APEI_NMI diff --git a/kernel/watchdog.c b/kernel/watchdog.c index eb8f94b50101..69666f4b8e8f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -615,41 +615,6 @@ static void watchdog_nmi_disable(unsigned int cpu) } } -void watchdog_nmi_enable_all(void) -{ - int cpu; - - mutex_lock(&watchdog_proc_mutex); - - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto unlock; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - watchdog_nmi_enable(cpu); - put_online_cpus(); - -unlock: - mutex_unlock(&watchdog_proc_mutex); -} - -void 
watchdog_nmi_disable_all(void) -{ - int cpu; - - mutex_lock(&watchdog_proc_mutex); - - if (!watchdog_running) - goto unlock; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - watchdog_nmi_disable(cpu); - put_online_cpus(); - -unlock: - mutex_unlock(&watchdog_proc_mutex); -} #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } -- cgit v1.2.3 From ec6a90661a0d6ce1461d05c7a58a0a151154e14a Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Fri, 4 Sep 2015 15:45:28 -0700 Subject: watchdog: rename watchdog_suspend() and watchdog_resume() Rename watchdog_suspend() to lockup_detector_suspend() and watchdog_resume() to lockup_detector_resume() to avoid confusion with the watchdog subsystem and to be consistent with the existing name lockup_detector_init(). Also provide comment blocks to explain the watchdog_running and watchdog_suspended variables and their relationship. Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Cc: Guenter Roeck Cc: Don Zickus Cc: Ulrich Obergfell Cc: Jiri Olsa Cc: Michal Hocko Cc: Stephane Eranian Cc: Chris Metcalf Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event_intel.c | 4 ++-- include/linux/nmi.h | 8 ++++---- kernel/watchdog.c | 26 ++++++++++++++++++++++---- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 144ab91951a7..cd9b6d0b10bf 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -3627,7 +3627,7 @@ static __init int fixup_ht_bug(void) return 0; } - if (watchdog_suspend() != 0) { + if (lockup_detector_suspend() != 0) { pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); return 0; } @@ -3638,7 +3638,7 @@ static __init int fixup_ht_bug(void) x86_pmu.commit_scheduling = NULL; x86_pmu.stop_scheduling = NULL; - watchdog_resume(); + lockup_detector_resume(); get_online_cpus(); diff --git a/include/linux/nmi.h b/include/linux/nmi.h index e5afe8bae202..a91adf6e02f2 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -78,15 +78,15 @@ extern int proc_watchdog_thresh(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int proc_watchdog_cpumask(struct ctl_table *, int, void __user *, size_t *, loff_t *); -extern int watchdog_suspend(void); -extern void watchdog_resume(void); +extern int lockup_detector_suspend(void); +extern void lockup_detector_resume(void); #else -static inline int watchdog_suspend(void) +static inline int lockup_detector_suspend(void) { return 0; } -static inline void watchdog_resume(void) +static inline void lockup_detector_resume(void) { } #endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 69666f4b8e8f..64ed1c37bd1f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -67,8 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #define for_each_watchdog_cpu(cpu) \ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) -static int __read_mostly watchdog_suspended; +/* + * The 'watchdog_running' variable is set to 1 when the watchdog threads + * are registered/started and is set to 0 when the watchdog threads are + * unregistered/stopped, so it is an indicator whether the threads exist. 
+ */ static int __read_mostly watchdog_running; +/* + * If a subsystem has a need to deactivate the watchdog temporarily, it + * can use the suspend/resume interface to achieve this. The content of + * the 'watchdog_suspended' variable reflects this state. Existing threads + * are parked/unparked by the lockup_detector_{suspend|resume} functions + * (see comment blocks pertaining to those functions for further details). + * + * 'watchdog_suspended' also prevents threads from being registered/started + * or unregistered/stopped via parameters in /proc/sys/kernel, so the state + * of 'watchdog_running' cannot change while the watchdog is deactivated + * temporarily (see related code in 'proc' handlers). + */ +static int __read_mostly watchdog_suspended; + static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -669,7 +687,7 @@ static void watchdog_unpark_threads(void) /* * Suspend the hard and soft lockup detector by parking the watchdog threads. */ -int watchdog_suspend(void) +int lockup_detector_suspend(void) { int ret = 0; @@ -679,7 +697,7 @@ int watchdog_suspend(void) * the 'watchdog_suspended' variable). If the watchdog threads are * running, the first caller takes care that they will be parked. * The state of 'watchdog_running' cannot change while a suspend - * request is active (see related changes in 'proc' handlers). + * request is active (see related code in 'proc' handlers). */ if (watchdog_running && !watchdog_suspended) ret = watchdog_park_threads(); @@ -695,7 +713,7 @@ int watchdog_suspend(void) /* * Resume the hard and soft lockup detector by unparking the watchdog threads. */ -void watchdog_resume(void) +void lockup_detector_resume(void) { mutex_lock(&watchdog_proc_mutex); -- cgit v1.2.3 From 2ae44005b678431a5c7a55dafcd09421ba3fadf0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 4 Sep 2015 15:45:31 -0700 Subject: slub: fix spelling succedd to succeed With this patchset the SLUB allocator now has both bulk alloc and free implemented. This patchset mostly optimizes the "fastpath" where objects are available on the per CPU fastpath page. This mostly amortize the less-heavy none-locked cmpxchg_double used on fastpath. The "fallback" bulking (e.g __kmem_cache_free_bulk) provides a good basis for comparison. Measurements[1] of the fallback functions __kmem_cache_{free,alloc}_bulk have been copied from slab_common.c and forced "noinline" to force a function call like slab_common.c. 
Measurements on CPU CPU i7-4790K @ 4.00GHz Baseline normal fastpath (alloc+free cost): 42 cycles(tsc) 10.601 ns Measurements last-patch with disabled debugging: Bulk- fallback - this-patch 1 - 57 cycles(tsc) 14.448 ns - 44 cycles(tsc) 11.236 ns improved 22.8% 2 - 51 cycles(tsc) 12.768 ns - 28 cycles(tsc) 7.019 ns improved 45.1% 3 - 48 cycles(tsc) 12.232 ns - 22 cycles(tsc) 5.526 ns improved 54.2% 4 - 48 cycles(tsc) 12.025 ns - 19 cycles(tsc) 4.786 ns improved 60.4% 8 - 46 cycles(tsc) 11.558 ns - 18 cycles(tsc) 4.572 ns improved 60.9% 16 - 45 cycles(tsc) 11.458 ns - 18 cycles(tsc) 4.658 ns improved 60.0% 30 - 45 cycles(tsc) 11.499 ns - 18 cycles(tsc) 4.568 ns improved 60.0% 32 - 79 cycles(tsc) 19.917 ns - 65 cycles(tsc) 16.454 ns improved 17.7% 34 - 78 cycles(tsc) 19.655 ns - 63 cycles(tsc) 15.932 ns improved 19.2% 48 - 68 cycles(tsc) 17.049 ns - 50 cycles(tsc) 12.506 ns improved 26.5% 64 - 80 cycles(tsc) 20.009 ns - 63 cycles(tsc) 15.929 ns improved 21.3% 128 - 94 cycles(tsc) 23.749 ns - 86 cycles(tsc) 21.583 ns improved 8.5% 158 - 97 cycles(tsc) 24.299 ns - 90 cycles(tsc) 22.552 ns improved 7.2% 250 - 102 cycles(tsc) 25.681 ns - 98 cycles(tsc) 24.589 ns improved 3.9% Benchmarking shows impressive improvements in the "fastpath" with a small number of objects in the working set. Once the working set increases, resulting in activating the "slowpath" (that contains the heavier locked cmpxchg_double) the improvement decreases. I'm currently working on also optimizing the "slowpath" (as network stack use-case hits this), but this patchset should provide a good foundation for further improvements. Rest of my patch queue in this area needs some more work, but preliminary results are good. I'm attending Netfilter Workshop[2] next week, and I'll hopefully return working on further improvements in this area. This patch (of 6): s/succedd/succeed/ Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index f68c0e50f3c0..defd76f98648 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2712,7 +2712,7 @@ redo: * Determine the currently cpus per cpu slab. * The cpu may change afterward. However that does not matter since * data is retrieved via this pointer. If we are on the same cpu - * during the cmpxchg then the free will succedd. + * during the cmpxchg then the free will succeed. */ do { tid = this_cpu_read(s->cpu_slab->tid); -- cgit v1.2.3 From 484748f0b65a1950b2b93f444a2287e8dd2cedd6 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 4 Sep 2015 15:45:34 -0700 Subject: slab: infrastructure for bulk object allocation and freeing Add the basic infrastructure for alloc/free operations on pointer arrays. It includes a generic function in the common slab code that is used in this infrastructure patch to create the unoptimized functionality for slab bulk operations. Allocators can then provide optimized allocation functions for situations in which large numbers of objects are needed. These optimization may avoid taking locks repeatedly and bypass metadata creation if all objects in slab pages can be used to provide the objects required. Allocators can extend the skeletons provided and add their own code to the bulk alloc and free functions. They can keep the generic allocation and freeing and just fall back to those if optimizations would not work (like for example when debugging is on). 
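A hypothetical caller of the new pair would look like this (sketch only; 'cachep' and the batch size of 16 are illustrative). Per the comment added to slab.h, interrupts must be enabled, and on failure kmem_cache_alloc_bulk() has already freed any partially allocated objects:

	void *objs[16];

	if (!kmem_cache_alloc_bulk(cachep, GFP_KERNEL, ARRAY_SIZE(objs), objs))
		return -ENOMEM;
	/* ... use the objects ... */
	kmem_cache_free_bulk(cachep, ARRAY_SIZE(objs), objs);
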
Signed-off-by: Christoph Lameter Signed-off-by: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 10 ++++++++++ mm/slab.c | 13 +++++++++++++ mm/slab.h | 9 +++++++++ mm/slab_common.c | 23 +++++++++++++++++++++++ mm/slob.c | 13 +++++++++++++ mm/slub.c | 14 ++++++++++++++ 6 files changed, 82 insertions(+) diff --git a/include/linux/slab.h b/include/linux/slab.h index a99f0e5243e1..7e37d448ed91 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -290,6 +290,16 @@ void *__kmalloc(size_t size, gfp_t flags); void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags); void kmem_cache_free(struct kmem_cache *, void *); +/* + * Bulk allocation and freeing operations. These are accellerated in an + * allocator specific way to avoid taking locks repeatedly or building + * metadata structures unnecessarily. + * + * Note that interrupts must be enabled when calling these functions. + */ +void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); +bool kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); + #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node); void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); diff --git a/mm/slab.c b/mm/slab.c index bbd0b47dc6a9..60c936938b84 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3416,6 +3416,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + return __kmem_cache_alloc_bulk(s, flags, size, p); +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + #ifdef CONFIG_TRACING void * kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) diff --git a/mm/slab.h b/mm/slab.h index 8da63e4e470f..88b55497738c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -163,6 +163,15 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); +/* + * Generic implementation of bulk operations + * These are useful for situations in which the allocator cannot + * perform optimizations. In that case segments of the objecct listed + * may be allocated or freed using these operations. + */ +void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); +bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); + #ifdef CONFIG_MEMCG_KMEM /* * Iterate over all memcg caches of the given root cache. 
The caller must hold diff --git a/mm/slab_common.c b/mm/slab_common.c index 86831105a09f..c26829fe4e37 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -104,6 +104,29 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) } #endif +void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) + kmem_cache_free(s, p[i]); +} + +bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, + void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) { + void *x = p[i] = kmem_cache_alloc(s, flags); + if (!x) { + __kmem_cache_free_bulk(s, i, p); + return false; + } + } + return true; +} + #ifdef CONFIG_MEMCG_KMEM void slab_init_memcg_params(struct kmem_cache *s) { diff --git a/mm/slob.c b/mm/slob.c index 4765f65019c7..165bbd3cd606 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -611,6 +611,19 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + return __kmem_cache_alloc_bulk(s, flags, size, p); +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + int __kmem_cache_shutdown(struct kmem_cache *c) { /* No way to check for remaining objects */ diff --git a/mm/slub.c b/mm/slub.c index defd76f98648..3ca89ef9b7b0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2750,6 +2750,20 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + return __kmem_cache_alloc_bulk(s, flags, size, p); +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + + /* * Object placement in a slab is made very easy because we always start at * offset 0. If we tune the size of the object to the alignment then we can -- cgit v1.2.3 From 994eb764ec5ad57c9b7c5e72b892205039a84b69 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 4 Sep 2015 15:45:37 -0700 Subject: slub bulk alloc: extract objects from the per cpu slab First piece: acceleration of retrieval of per cpu objects If we are allocating lots of objects then it is advantageous to disable interrupts and avoid the this_cpu_cmpxchg() operation to get these objects faster. Note that we cannot do the fast operation if debugging is enabled, because we would have to add extra code to do all the debugging checks. And it would not be fast anyway. Note also that the requirement of having interrupts disabled avoids having to do processor flag operations. Allocate as many objects as possible in the fast way and then fall back to the generic implementation for the rest of the objects. 
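Condensed, the fast path added below boils down to popping objects off the per cpu freelist with plain loads and stores, which is only safe because local IRQs are disabled (sketch; variable declarations, debug fallback and __GFP_ZERO handling omitted):

	local_irq_disable();
	c = this_cpu_ptr(s->cpu_slab);
	while (i < size && c->freelist) {
		p[i++] = c->freelist;
		c->freelist = get_freepointer(s, c->freelist);
	}
	c->tid = next_tid(c->tid);	/* invalidate tids seen by concurrent fastpaths */
	local_irq_enable();
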
Measurements on CPU CPU i7-4790K @ 4.00GHz Baseline normal fastpath (alloc+free cost): 42 cycles(tsc) 10.554 ns Bulk- fallback - this-patch 1 - 57 cycles(tsc) 14.432 ns - 48 cycles(tsc) 12.155 ns improved 15.8% 2 - 50 cycles(tsc) 12.746 ns - 37 cycles(tsc) 9.390 ns improved 26.0% 3 - 48 cycles(tsc) 12.180 ns - 33 cycles(tsc) 8.417 ns improved 31.2% 4 - 48 cycles(tsc) 12.015 ns - 32 cycles(tsc) 8.045 ns improved 33.3% 8 - 46 cycles(tsc) 11.526 ns - 30 cycles(tsc) 7.699 ns improved 34.8% 16 - 45 cycles(tsc) 11.418 ns - 32 cycles(tsc) 8.205 ns improved 28.9% 30 - 80 cycles(tsc) 20.246 ns - 73 cycles(tsc) 18.328 ns improved 8.8% 32 - 79 cycles(tsc) 19.946 ns - 72 cycles(tsc) 18.208 ns improved 8.9% 34 - 78 cycles(tsc) 19.659 ns - 71 cycles(tsc) 17.987 ns improved 9.0% 48 - 86 cycles(tsc) 21.516 ns - 82 cycles(tsc) 20.566 ns improved 4.7% 64 - 93 cycles(tsc) 23.423 ns - 89 cycles(tsc) 22.480 ns improved 4.3% 128 - 100 cycles(tsc) 25.170 ns - 99 cycles(tsc) 24.871 ns improved 1.0% 158 - 102 cycles(tsc) 25.549 ns - 101 cycles(tsc) 25.375 ns improved 1.0% 250 - 101 cycles(tsc) 25.344 ns - 100 cycles(tsc) 25.182 ns improved 1.0% Signed-off-by: Christoph Lameter Signed-off-by: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 3ca89ef9b7b0..30e7dedec664 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2750,16 +2750,61 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); +/* Note that interrupts must be enabled when calling this function. */ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) { __kmem_cache_free_bulk(s, size, p); } EXPORT_SYMBOL(kmem_cache_free_bulk); +/* Note that interrupts must be enabled when calling this function. */ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) + void **p) { - return __kmem_cache_alloc_bulk(s, flags, size, p); + struct kmem_cache_cpu *c; + int i; + + /* Debugging fallback to generic bulk */ + if (kmem_cache_debug(s)) + return __kmem_cache_alloc_bulk(s, flags, size, p); + + /* + * Drain objects in the per cpu slab, while disabling local + * IRQs, which protects against PREEMPT and interrupts + * handlers invoking normal fastpath. + */ + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + + for (i = 0; i < size; i++) { + void *object = c->freelist; + + if (!object) + break; + + c->freelist = get_freepointer(s, object); + p[i] = object; + } + c->tid = next_tid(c->tid); + local_irq_enable(); + + /* Clear memory outside IRQ disabled fastpath loop */ + if (unlikely(flags & __GFP_ZERO)) { + int j; + + for (j = 0; j < i; j++) + memset(p[j], 0, s->object_size); + } + + /* Fallback to single elem alloc */ + for (; i < size; i++) { + void *x = p[i] = kmem_cache_alloc(s, flags); + if (unlikely(!x)) { + __kmem_cache_free_bulk(s, i, p); + return false; + } + } + return true; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -- cgit v1.2.3 From ebe909e0fdb34b980c5cf636c495e4f0bb0dfda8 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 4 Sep 2015 15:45:40 -0700 Subject: slub: improve bulk alloc strategy Call slowpath __slab_alloc() from within the bulk loop, as the side-effect of this call likely repopulates c->freelist. Choose to reenable local IRQs while calling slowpath. Saving some optimizations for later. E.g. 
it is possible to extract parts of __slab_alloc() and avoid the unnecessary and expensive (37 cycles) local_irq_{save,restore}. For now, be happy calling __slab_alloc() this lower icache impact of this func and I don't have to worry about correctness. Measurements on CPU CPU i7-4790K @ 4.00GHz Baseline normal fastpath (alloc+free cost): 42 cycles(tsc) 10.601 ns Bulk- fallback - this-patch 1 - 58 cycles(tsc) 14.516 ns - 49 cycles(tsc) 12.459 ns improved 15.5% 2 - 51 cycles(tsc) 12.930 ns - 38 cycles(tsc) 9.605 ns improved 25.5% 3 - 49 cycles(tsc) 12.274 ns - 34 cycles(tsc) 8.525 ns improved 30.6% 4 - 48 cycles(tsc) 12.058 ns - 32 cycles(tsc) 8.036 ns improved 33.3% 8 - 46 cycles(tsc) 11.609 ns - 31 cycles(tsc) 7.756 ns improved 32.6% 16 - 45 cycles(tsc) 11.451 ns - 32 cycles(tsc) 8.148 ns improved 28.9% 30 - 79 cycles(tsc) 19.865 ns - 68 cycles(tsc) 17.164 ns improved 13.9% 32 - 76 cycles(tsc) 19.212 ns - 66 cycles(tsc) 16.584 ns improved 13.2% 34 - 74 cycles(tsc) 18.600 ns - 63 cycles(tsc) 15.954 ns improved 14.9% 48 - 88 cycles(tsc) 22.092 ns - 77 cycles(tsc) 19.373 ns improved 12.5% 64 - 80 cycles(tsc) 20.043 ns - 68 cycles(tsc) 17.188 ns improved 15.0% 128 - 99 cycles(tsc) 24.818 ns - 89 cycles(tsc) 22.404 ns improved 10.1% 158 - 99 cycles(tsc) 24.977 ns - 92 cycles(tsc) 23.089 ns improved 7.1% 250 - 106 cycles(tsc) 26.552 ns - 99 cycles(tsc) 24.785 ns improved 6.6% Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 30e7dedec664..f98b1b9e7080 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2779,8 +2779,22 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, for (i = 0; i < size; i++) { void *object = c->freelist; - if (!object) - break; + if (unlikely(!object)) { + local_irq_enable(); + /* + * Invoking slow path likely have side-effect + * of re-populating per CPU c->freelist + */ + p[i] = __slab_alloc(s, flags, NUMA_NO_NODE, + _RET_IP_, c); + if (unlikely(!p[i])) { + __kmem_cache_free_bulk(s, i, p); + return false; + } + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + continue; /* goto for-loop */ + } c->freelist = get_freepointer(s, object); p[i] = object; @@ -2796,14 +2810,6 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, memset(p[j], 0, s->object_size); } - /* Fallback to single elem alloc */ - for (; i < size; i++) { - void *x = p[i] = kmem_cache_alloc(s, flags); - if (unlikely(!x)) { - __kmem_cache_free_bulk(s, i, p); - return false; - } - } return true; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); -- cgit v1.2.3 From fbd02630c6e3c60feecc4688f5f98b015d264516 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 4 Sep 2015 15:45:43 -0700 Subject: slub: initial bulk free implementation This implements SLUB specific kmem_cache_free_bulk(). SLUB allocator now both have bulk alloc and free implemented. Choose to reenable local IRQs while calling slowpath __slab_free(). In worst case, where all objects hit slowpath call, the performance should still be faster than fallback function __kmem_cache_free_bulk(), because local_irq_{disable+enable} is very fast (7-cycles), while the fallback invokes this_cpu_cmpxchg() which is slightly slower (9-cycles). Nitpicking, this should be faster for N>=4, due to the entry cost of local_irq_{disable+enable}. 
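To spell out where the N>=4 estimate presumably comes from, using the cycle counts just quoted: per slowpath object, the bulk variant pays a 7-cycle local_irq_{disable+enable} toggle where the fallback pays a 9-cycle this_cpu_cmpxchg(), a saving of roughly 9 - 7 = 2 cycles per object, while the bulk entry/exit costs one extra 7-cycle toggle; 2 * N > 7 first holds at N = 4.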
Do notice that the save+restore variant is very expensive, this is key to why this optimization works. CPU: i7-4790K CPU @ 4.00GHz * local_irq_{disable,enable}: 7 cycles(tsc) - 1.821 ns * local_irq_{save,restore} : 37 cycles(tsc) - 9.443 ns Measurements on CPU CPU i7-4790K @ 4.00GHz Baseline normal fastpath (alloc+free cost): 43 cycles(tsc) 10.834 ns Bulk- fallback - this-patch 1 - 58 cycles(tsc) 14.542 ns - 43 cycles(tsc) 10.811 ns improved 25.9% 2 - 50 cycles(tsc) 12.659 ns - 27 cycles(tsc) 6.867 ns improved 46.0% 3 - 48 cycles(tsc) 12.168 ns - 21 cycles(tsc) 5.496 ns improved 56.2% 4 - 47 cycles(tsc) 11.987 ns - 24 cycles(tsc) 6.038 ns improved 48.9% 8 - 46 cycles(tsc) 11.518 ns - 17 cycles(tsc) 4.280 ns improved 63.0% 16 - 45 cycles(tsc) 11.366 ns - 17 cycles(tsc) 4.483 ns improved 62.2% 30 - 45 cycles(tsc) 11.433 ns - 18 cycles(tsc) 4.531 ns improved 60.0% 32 - 75 cycles(tsc) 18.983 ns - 58 cycles(tsc) 14.586 ns improved 22.7% 34 - 71 cycles(tsc) 17.940 ns - 53 cycles(tsc) 13.391 ns improved 25.4% 48 - 80 cycles(tsc) 20.077 ns - 65 cycles(tsc) 16.268 ns improved 18.8% 64 - 71 cycles(tsc) 17.799 ns - 53 cycles(tsc) 13.440 ns improved 25.4% 128 - 91 cycles(tsc) 22.980 ns - 79 cycles(tsc) 19.899 ns improved 13.2% 158 - 100 cycles(tsc) 25.241 ns - 90 cycles(tsc) 22.732 ns improved 10.0% 250 - 102 cycles(tsc) 25.583 ns - 95 cycles(tsc) 23.916 ns improved 6.9% Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index f98b1b9e7080..2bd3d12ae106 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2753,7 +2753,39 @@ EXPORT_SYMBOL(kmem_cache_free); /* Note that interrupts must be enabled when calling this function. */ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) { - __kmem_cache_free_bulk(s, size, p); + struct kmem_cache_cpu *c; + struct page *page; + int i; + + /* Debugging fallback to generic bulk */ + if (kmem_cache_debug(s)) + return __kmem_cache_free_bulk(s, size, p); + + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + + for (i = 0; i < size; i++) { + void *object = p[i]; + + BUG_ON(!object); + page = virt_to_head_page(object); + BUG_ON(s != page->slab_cache); /* Check if valid slab page */ + + if (c->page == page) { + /* Fastpath: local CPU free */ + set_freepointer(s, object, c->freelist); + c->freelist = object; + } else { + c->tid = next_tid(c->tid); + local_irq_enable(); + /* Slowpath: overhead locked cmpxchg_double_slab */ + __slab_free(s, page, object, _RET_IP_); + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + } + } + c->tid = next_tid(c->tid); + local_irq_enable(); } EXPORT_SYMBOL(kmem_cache_free_bulk); -- cgit v1.2.3 From 3eed034d045ce93a40e232a6bd5f86127342053a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 4 Sep 2015 15:45:45 -0700 Subject: slub: add support for kmem_cache_debug in bulk calls Per request of Joonsoo Kim adding kmem debug support. I've tested that when debugging is disabled, then there is almost no performance impact as this code basically gets removed by the compiler. Need some guidance in enabling and testing this. 
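One plausible way to exercise the debug paths: SLUB debugging can be enabled for all caches at boot with the slub_debug parameter (e.g. slub_debug=FZP for sanity checks, red zoning and poisoning), or per cache at runtime via sysfs, as the duplicate-sysfs-filename fix later in this series demonstrates:

	# echo 1 | tee /sys/kernel/slab/*/sanity_checks
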
bulk- PREVIOUS - THIS-PATCH 1 - 43 cycles(tsc) 10.811 ns - 44 cycles(tsc) 11.236 ns improved -2.3% 2 - 27 cycles(tsc) 6.867 ns - 28 cycles(tsc) 7.019 ns improved -3.7% 3 - 21 cycles(tsc) 5.496 ns - 22 cycles(tsc) 5.526 ns improved -4.8% 4 - 24 cycles(tsc) 6.038 ns - 19 cycles(tsc) 4.786 ns improved 20.8% 8 - 17 cycles(tsc) 4.280 ns - 18 cycles(tsc) 4.572 ns improved -5.9% 16 - 17 cycles(tsc) 4.483 ns - 18 cycles(tsc) 4.658 ns improved -5.9% 30 - 18 cycles(tsc) 4.531 ns - 18 cycles(tsc) 4.568 ns improved 0.0% 32 - 58 cycles(tsc) 14.586 ns - 65 cycles(tsc) 16.454 ns improved -12.1% 34 - 53 cycles(tsc) 13.391 ns - 63 cycles(tsc) 15.932 ns improved -18.9% 48 - 65 cycles(tsc) 16.268 ns - 50 cycles(tsc) 12.506 ns improved 23.1% 64 - 53 cycles(tsc) 13.440 ns - 63 cycles(tsc) 15.929 ns improved -18.9% 128 - 79 cycles(tsc) 19.899 ns - 86 cycles(tsc) 21.583 ns improved -8.9% 158 - 90 cycles(tsc) 22.732 ns - 90 cycles(tsc) 22.552 ns improved 0.0% 250 - 95 cycles(tsc) 23.916 ns - 98 cycles(tsc) 24.589 ns improved -3.2% Signed-off-by: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 2bd3d12ae106..48bdb4e5a985 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2757,10 +2757,6 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) struct page *page; int i; - /* Debugging fallback to generic bulk */ - if (kmem_cache_debug(s)) - return __kmem_cache_free_bulk(s, size, p); - local_irq_disable(); c = this_cpu_ptr(s->cpu_slab); @@ -2768,8 +2764,13 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) void *object = p[i]; BUG_ON(!object); + /* kmem cache debug support */ + s = cache_from_obj(s, object); + if (unlikely(!s)) + goto exit; + slab_free_hook(s, object); + page = virt_to_head_page(object); - BUG_ON(s != page->slab_cache); /* Check if valid slab page */ if (c->page == page) { /* Fastpath: local CPU free */ @@ -2784,6 +2785,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) c = this_cpu_ptr(s->cpu_slab); } } +exit: c->tid = next_tid(c->tid); local_irq_enable(); } @@ -2796,10 +2798,6 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, struct kmem_cache_cpu *c; int i; - /* Debugging fallback to generic bulk */ - if (kmem_cache_debug(s)) - return __kmem_cache_alloc_bulk(s, flags, size, p); - /* * Drain objects in the per cpu slab, while disabling local * IRQs, which protects against PREEMPT and interrupts @@ -2828,8 +2826,20 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, continue; /* goto for-loop */ } + /* kmem_cache debug support */ + s = slab_pre_alloc_hook(s, flags); + if (unlikely(!s)) { + __kmem_cache_free_bulk(s, i, p); + c->tid = next_tid(c->tid); + local_irq_enable(); + return false; + } + c->freelist = get_freepointer(s, object); p[i] = object; + + /* kmem_cache debug support */ + slab_post_alloc_hook(s, flags, object); } c->tid = next_tid(c->tid); local_irq_enable(); -- cgit v1.2.3 From 588f8ba913d35a667407afa29444bf2f94e03d19 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Sep 2015 15:45:48 -0700 Subject: mm/slub: move slab initialization into irq enabled region Initializing a new slab can introduce rather large latencies because most of the initialization runs always with interrupts disabled. There is no point in doing so. 
The newly allocated slab is not visible yet, so there is no reason to protect it against concurrent alloc/free. Move the expensive parts of the initialization into allocate_slab(), so for all allocations with GFP_WAIT set, interrupts are enabled. Signed-off-by: Thomas Gleixner Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 89 ++++++++++++++++++++++++++++++--------------------------------- 1 file changed, 42 insertions(+), 47 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 48bdb4e5a985..3efcdc02082c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1306,6 +1306,17 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) kasan_slab_free(s, x); } +static void setup_object(struct kmem_cache *s, struct page *page, + void *object) +{ + setup_object_debug(s, page, object); + if (unlikely(s->ctor)) { + kasan_unpoison_object_data(s, object); + s->ctor(object); + kasan_poison_object_data(s, object); + } +} + /* * Slab allocation and freeing */ @@ -1336,6 +1347,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct page *page; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + void *start, *p; + int idx, order; flags &= gfp_allowed_mask; @@ -1359,13 +1372,13 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Try a lower order alloc if possible */ page = alloc_slab_page(s, alloc_gfp, node, oo); - - if (page) - stat(s, ORDER_FALLBACK); + if (unlikely(!page)) + goto out; + stat(s, ORDER_FALLBACK); } - if (kmemcheck_enabled && page - && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { + if (kmemcheck_enabled && + !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); @@ -1380,51 +1393,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) kmemcheck_mark_unallocated_pages(page, pages); } - if (flags & __GFP_WAIT) - local_irq_disable(); - if (!page) - return NULL; - page->objects = oo_objects(oo); - mod_zone_page_state(page_zone(page), - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - 1 << oo_order(oo)); - - return page; -} - -static void setup_object(struct kmem_cache *s, struct page *page, - void *object) -{ - setup_object_debug(s, page, object); - if (unlikely(s->ctor)) { - kasan_unpoison_object_data(s, object); - s->ctor(object); - kasan_poison_object_data(s, object); - } -} - -static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) -{ - struct page *page; - void *start; - void *p; - int order; - int idx; - - if (unlikely(flags & GFP_SLAB_BUG_MASK)) { - pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); - BUG(); - } - - page = allocate_slab(s, - flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); - if (!page) - goto out; order = compound_order(page); - inc_slabs_node(s, page_to_nid(page), page->objects); page->slab_cache = s; __SetPageSlab(page); if (page_is_pfmemalloc(page)) @@ -1448,10 +1419,34 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) page->freelist = start; page->inuse = page->objects; page->frozen = 1; + out: + if (flags & __GFP_WAIT) + local_irq_disable(); + if (!page) + return NULL; + + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + 1 << oo_order(oo)); + + inc_slabs_node(s, page_to_nid(page), page->objects); + return page; } +static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + if (unlikely(flags & GFP_SLAB_BUG_MASK)) { + pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); + BUG(); + } + + return allocate_slab(s, + flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); +} + static void __free_slab(struct kmem_cache *s, struct page *page) { int order = compound_order(page); -- cgit v1.2.3 From 80da026a8e5da83b6ab029807844587960490e2b Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 4 Sep 2015 15:45:51 -0700 Subject: mm/slub: fix slab double-free in case of duplicate sysfs filename sysfs_slab_add() shouldn't call kobject_put at error path: this puts last reference of kmem-cache kobject and frees it. Kmem cache will be freed second time at error path in kmem_cache_create(). For example this happens when slub debug was enabled in runtime and somebody creates new kmem cache: # echo 1 | tee /sys/kernel/slab/*/sanity_checks # modprobe configfs "configfs_dir_cache" cannot be merged because existing slab have debug and cannot create new slab because unique name ":t-0000096" already taken. Signed-off-by: Konstantin Khlebnikov Acked-by: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 3efcdc02082c..7e9e508263fb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5283,7 +5283,7 @@ static int sysfs_slab_add(struct kmem_cache *s) s->kobj.kset = cache_kset(s); err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); if (err) - goto out_put_kobj; + goto out; err = sysfs_create_group(&s->kobj, &slab_attr_group); if (err) @@ -5310,8 +5310,6 @@ out: return err; out_del_kobj: kobject_del(&s->kobj); -out_put_kobj: - kobject_put(&s->kobj); goto out; } -- cgit v1.2.3 From 45eb00cd3a034b8448f52fd9074e9b2b11d857c1 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Fri, 4 Sep 2015 15:45:54 -0700 Subject: mm/slub: don't wait for high-order page allocation Description is almost copied from commit fb05e7a89f50 ("net: don't wait for order-3 page allocation"). I saw excessive direct memory reclaim/compaction triggered by slub. This causes performance issues and add latency. Slub uses high-order allocation to reduce internal fragmentation and management overhead. But, direct memory reclaim/compaction has high overhead and the benefit of high-order allocation can't compensate the overhead of both work. This patch makes auxiliary high-order allocation atomic. If there is no memory pressure and memory isn't fragmented, the alloction will still success, so we don't sacrifice high-order allocation's benefit here. If the atomic allocation fails, direct memory reclaim/compaction will not be triggered, allocation fallback to low-order immediately, hence the direct memory reclaim/compaction overhead is avoided. In the allocation failure case, kswapd is waken up and trying to make high-order freepages, so allocation could success next time. Following is the test to measure effect of this patch. System: QEMU, CPU 8, 512 MB Mem: 25% memory is allocated at random position to make fragmentation. Memory-hogger occupies 150 MB memory. 
Workload: hackbench -g 20 -l 1000 Average result by 10 runs (Base va Patched) elapsed_time(s): 4.3468 vs 2.9838 compact_stall: 461.7 vs 73.6 pgmigrate_success: 28315.9 vs 7256.1 Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Cc: Shaohua Li Cc: Vlastimil Babka Cc: Michal Hocko Cc: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 7e9e508263fb..084184e706c6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1362,6 +1362,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * so we fall-back to the minimum order allocation. */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; + if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min)) + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT; page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { -- cgit v1.2.3 From 2d16e0fd3265b42648d267b7adb837db9798edaf Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Sep 2015 15:45:57 -0700 Subject: mm/slab.h: fix argument order in cache_from_obj's error message While debugging a networking issue, I hit a condition that triggered an object to be freed into the wrong kmem cache, and thus triggered the warning in cache_from_obj(). The arguments in the error message are in wrong order: the location of the object's kmem cache is in cachep, not s. Signed-off-by: Daniel Borkmann Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slab.h b/mm/slab.h index 88b55497738c..a3a967d7d7c2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -330,7 +330,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return cachep; pr_err("%s: Wrong slab cache. %s but object is from %s\n", - __func__, cachep->name, s->name); + __func__, s->name, cachep->name); WARN_ON_ONCE(1); return s; } -- cgit v1.2.3 From 25edd8bffd0f7563f0c04c1d219eb89061ce9886 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:00 -0700 Subject: userfaultfd: linux/Documentation/vm/userfaultfd.txt This is the latest userfaultfd patchset. The postcopy live migration feature on the qemu side is mostly ready to be merged and it entirely depends on the userfaultfd syscall to be merged as well. So it'd be great if this patchset could be reviewed for merging in -mm. Userfaults allow to implement on demand paging from userland and more generally they allow userland to more efficiently take control of the behavior of page faults than what was available before (PROT_NONE + SIGSEGV trap). The use cases are: 1) KVM postcopy live migration (one form of cloud memory externalization). KVM postcopy live migration is the primary driver of this work: http://blog.zhaw.ch/icclab/setting-up-post-copy-live-migration-in-openstack/ http://lists.gnu.org/archive/html/qemu-devel/2015-02/msg04873.html 2) postcopy live migration of binaries inside linux containers: http://thread.gmane.org/gmane.linux.kernel.mm/132662 3) KVM postcopy live snapshotting (allowing to limit/throttle the memory usage, unlike fork would, plus the avoidance of fork overhead in the first place). 
While the wrprotect tracking is not implemented yet, the syscall API is already contemplating the wrprotect fault tracking and it's generic enough to allow its later implementation in a backwards compatible fashion. 4) KVM userfaults on shared memory. The UFFDIO_COPY lowlevel method should be extended to work also on tmpfs and then the uffdio_register.ioctls will notify userland that UFFDIO_COPY is available even when the registered virtual memory range is tmpfs backed. 5) alternate mechanism to notify web browsers or apps on embedded devices that volatile pages have been reclaimed. This basically avoids the need to run a syscall before the app can access with the CPU the virtual regions marked volatile. This depends on point 4) to be fulfilled first, as volatile pages happily apply to tmpfs. Even though there wasn't a real use case requesting it yet, it also allows to implement distributed shared memory in a way that readonly shared mappings can exist simultaneously in different hosts and they can be become exclusive at the first wrprotect fault. This patch (of 22): Add documentation. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/userfaultfd.txt | 142 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 Documentation/vm/userfaultfd.txt diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt new file mode 100644 index 000000000000..90912925425e --- /dev/null +++ b/Documentation/vm/userfaultfd.txt @@ -0,0 +1,142 @@ += Userfaultfd = + +== Objective == + +Userfaults allow the implementation of on-demand paging from userland +and more generally they allow userland to take control of various +memory page faults, something otherwise only the kernel code could do. + +For example userfaults allows a proper and more optimal implementation +of the PROT_NONE+SIGSEGV trick. + +== Design == + +Userfaults are delivered and resolved through the userfaultfd syscall. + +The userfaultfd (aside from registering and unregistering virtual +memory ranges) provides two primary functionalities: + +1) read/POLLIN protocol to notify a userland thread of the faults + happening + +2) various UFFDIO_* ioctls that can manage the virtual memory regions + registered in the userfaultfd that allows userland to efficiently + resolve the userfaults it receives via 1) or to manage the virtual + memory in the background + +The real advantage of userfaults if compared to regular virtual memory +management of mremap/mprotect is that the userfaults in all their +operations never involve heavyweight structures like vmas (in fact the +userfaultfd runtime load never takes the mmap_sem for writing). + +Vmas are not suitable for page- (or hugepage) granular fault tracking +when dealing with virtual address spaces that could span +Terabytes. Too many vmas would be needed for that. 
+ +The userfaultfd once opened by invoking the syscall, can also be +passed using unix domain sockets to a manager process, so the same +manager process could handle the userfaults of a multitude of +different processes without them being aware about what is going on +(well of course unless they later try to use the userfaultfd +themselves on the same region the manager is already tracking, which +is a corner case that would currently return -EBUSY). + +== API == + +When first opened the userfaultfd must be enabled invoking the +UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or +a later API version) which will specify the read/POLLIN protocol +userland intends to speak on the UFFD. The UFFDIO_API ioctl if +successful (i.e. if the requested uffdio_api.api is spoken also by the +running kernel), will return into uffdio_api.features and +uffdio_api.ioctls two 64bit bitmasks of respectively the activated +feature of the read(2) protocol and the generic ioctl available. + +Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should +be invoked (if present in the returned uffdio_api.ioctls bitmask) to +register a memory range in the userfaultfd by setting the +uffdio_register structure accordingly. The uffdio_register.mode +bitmask will specify to the kernel which kind of faults to track for +the range (UFFDIO_REGISTER_MODE_MISSING would track missing +pages). The UFFDIO_REGISTER ioctl will return the +uffdio_register.ioctls bitmask of ioctls that are suitable to resolve +userfaults on the range registered. Not all ioctls will necessarily be +supported for all memory types depending on the underlying virtual +memory backend (anonymous memory vs tmpfs vs real filebacked +mappings). + +Userland can use the uffdio_register.ioctls to manage the virtual +address space in the background (to add or potentially also remove +memory from the userfaultfd registered range). This means a userfault +could be triggering just before userland maps in the background the +user-faulted page. + +The primary ioctl to resolve userfaults is UFFDIO_COPY. That +atomically copies a page into the userfault registered range and wakes +up the blocked userfaults (unless uffdio_copy.mode & +UFFDIO_COPY_MODE_DONTWAKE is set). Other ioctl works similarly to +UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see an +half copied page since it'll keep userfaulting until the copy has +finished. + +== QEMU/KVM == + +QEMU/KVM is using the userfaultfd syscall to implement postcopy live +migration. Postcopy live migration is one form of memory +externalization consisting of a virtual machine running with part or +all of its memory residing on a different node in the cloud. The +userfaultfd abstraction is generic enough that not a single line of +KVM kernel code had to be modified in order to add postcopy live +migration to QEMU. + +Guest async page faults, FOLL_NOWAIT and all other GUP features work +just fine in combination with userfaults. Userfaults trigger async +page faults in the guest scheduler so those guest processes that +aren't waiting for userfaults (i.e. network bound) can keep running in +the guest vcpus. + +It is generally beneficial to run one pass of precopy live migration +just before starting postcopy live migration, in order to avoid +generating userfaults for readonly guest regions. 
+ +The implementation of postcopy live migration currently uses a +single bidirectional socket but in the future two different sockets +will be used (to reduce the latency of the userfaults to the minimum +possible without having to decrease /proc/sys/net/ipv4/tcp_wmem). + +The QEMU in the source node writes all pages that it knows are missing +in the destination node, into the socket, and the migration thread of +the QEMU running in the destination node runs UFFDIO_COPY|ZEROPAGE +ioctls on the userfaultfd in order to map the received pages into the +guest (UFFDIO_ZEROPAGE is used if the source page was a zero page). + +A different postcopy thread in the destination node listens with +poll() to the userfaultfd in parallel. When a POLLIN event is +generated after a userfault triggers, the postcopy thread read()s from +the userfaultfd and receives the fault address (or -EAGAIN in case the +userfault was already resolved and woken by a UFFDIO_COPY|ZEROPAGE run +by the parallel QEMU migration thread). + +After the QEMU postcopy thread (running in the destination node) gets +the userfault address, it writes the information about the missing page +into the socket. The QEMU source node receives the information and +roughly "seeks" to that page address and continues sending all +remaining missing pages from that new page offset. Soon after that +(just the time to flush the tcp_wmem queue through the network) the +migration thread in the QEMU running in the destination node will +receive the page that triggered the userfault and it'll map it as +usual with the UFFDIO_COPY|ZEROPAGE (without actually knowing if it +was spontaneously sent by the source or if it was an urgent page +requested through an userfault). + +By the time the userfaults start, the QEMU in the destination node +doesn't need to keep any per-page state bitmap relative to the live +migration around, and a single per-page bitmap has to be maintained in +the QEMU running in the source node to know which pages are still +missing in the destination node. The bitmap in the source node is +checked to find which missing pages to send in round robin, and we seek +over it when receiving incoming userfaults. After sending each page, of +course, the bitmap is updated accordingly. The bitmap is also useful to avoid +sending the same page twice (in case the userfault is read by the +postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration +thread). -- cgit v1.2.3 From 51360155eccb907ff8635bd10fc7de876408c2e0 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:04 -0700 Subject: userfaultfd: waitqueue: add nr wake parameter to __wake_up_locked_key userfaultfd needs to be able to wake all waitqueues (by passing 0 as the nr parameter), instead of the current hardcoded 1 (which would wake just the first waitqueue in the head list). Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr.
David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/wait.h | 5 +++-- kernel/sched/wait.c | 7 ++++--- net/sunrpc/sched.c | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index 1e1bf9f963a9..d3d077228d4c 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -147,7 +147,8 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) typedef int wait_bit_action_f(struct wait_bit_key *); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, + void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); @@ -179,7 +180,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); #define wake_up_poll(x, m) \ __wake_up(x, TASK_NORMAL, 1, (void *) (m)) #define wake_up_locked_poll(x, m) \ - __wake_up_locked_key((x), TASK_NORMAL, (void *) (m)) + __wake_up_locked_key((x), TASK_NORMAL, 1, (void *) (m)) #define wake_up_interruptible_poll(x, m) \ __wake_up(x, TASK_INTERRUPTIBLE, 1, (void *) (m)) #define wake_up_interruptible_sync_poll(x, m) \ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 052e02672d12..272d9322bc5d 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) } EXPORT_SYMBOL_GPL(__wake_up_locked); -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr, + void *key) { - __wake_up_common(q, mode, 1, 0, key); + __wake_up_common(q, mode, nr, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); @@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_locked_key(q, mode, key); + __wake_up_locked_key(q, mode, 1, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 337ca851a350..b140c092d226 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -297,7 +297,7 @@ static int rpc_complete_task(struct rpc_task *task) clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate); ret = atomic_dec_and_test(&task->tk_count); if (waitqueue_active(wq)) - __wake_up_locked_key(wq, TASK_NORMAL, &k); + __wake_up_locked_key(wq, TASK_NORMAL, 1, &k); spin_unlock_irqrestore(&wq->lock, flags); return ret; } -- cgit v1.2.3 From 1038628d80e96e3a086189172d9be8eb85ecfabf Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:04 -0700 Subject: userfaultfd: uAPI Defines the uAPI of the userfaultfd, notably the ioctl numbers and protocol. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. 
David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ioctl/ioctl-number.txt | 1 + include/uapi/linux/Kbuild | 1 + include/uapi/linux/userfaultfd.h | 83 ++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 include/uapi/linux/userfaultfd.h diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 64df08db4657..39ac6546d4a4 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -303,6 +303,7 @@ Code Seq#(hex) Include File Comments 0xA3 80-8F Port ACL in development: 0xA3 90-9F linux/dtlk.h +0xAA 00-3F linux/uapi/linux/userfaultfd.h 0xAB 00-1F linux/nbd.h 0xAC 00-1F linux/raw.h 0xAD 00 Netfilter device in development: diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index aafb9937b162..70ff1d9abf0d 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -456,3 +456,4 @@ header-y += xfrm.h header-y += xilinx-v4l2-controls.h header-y += zorro.h header-y += zorro_ids.h +header-y += userfaultfd.h diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h new file mode 100644 index 000000000000..09c2e2a8c9d6 --- /dev/null +++ b/include/uapi/linux/userfaultfd.h @@ -0,0 +1,83 @@ +/* + * include/linux/userfaultfd.h + * + * Copyright (C) 2007 Davide Libenzi + * Copyright (C) 2015 Red Hat, Inc. + * + */ + +#ifndef _LINUX_USERFAULTFD_H +#define _LINUX_USERFAULTFD_H + +#include + +#define UFFD_API ((__u64)0xAA) +/* FIXME: add "|UFFD_BIT_WP" to UFFD_API_BITS after implementing it */ +#define UFFD_API_BITS (UFFD_BIT_WRITE) +#define UFFD_API_IOCTLS \ + ((__u64)1 << _UFFDIO_REGISTER | \ + (__u64)1 << _UFFDIO_UNREGISTER | \ + (__u64)1 << _UFFDIO_API) +#define UFFD_API_RANGE_IOCTLS \ + ((__u64)1 << _UFFDIO_WAKE) + +/* + * Valid ioctl command number range with this API is from 0x00 to + * 0x3F. UFFDIO_API is the fixed number, everything else can be + * changed by implementing a different UFFD_API. If sticking to the + * same UFFD_API more ioctl can be added and userland will be aware of + * which ioctl the running kernel implements through the ioctl command + * bitmask written by the UFFDIO_API. + */ +#define _UFFDIO_REGISTER (0x00) +#define _UFFDIO_UNREGISTER (0x01) +#define _UFFDIO_WAKE (0x02) +#define _UFFDIO_API (0x3F) + +/* userfaultfd ioctl ids */ +#define UFFDIO 0xAA +#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \ + struct uffdio_api) +#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \ + struct uffdio_register) +#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \ + struct uffdio_range) +#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ + struct uffdio_range) + +/* + * Valid bits below PAGE_SHIFT in the userfault address read through + * the read() syscall. 
+ */ +#define UFFD_BIT_WRITE (1<<0) /* this was a write fault, MISSING or WP */ +#define UFFD_BIT_WP (1<<1) /* handle_userfault() reason VM_UFFD_WP */ +#define UFFD_BITS 2 /* two above bits used for UFFD_BIT_* mask */ + +struct uffdio_api { + /* userland asks for an API number */ + __u64 api; + + /* kernel answers below with the available features for the API */ + __u64 bits; + __u64 ioctls; +}; + +struct uffdio_range { + __u64 start; + __u64 len; +}; + +struct uffdio_register { + struct uffdio_range range; +#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) +#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) + __u64 mode; + + /* + * kernel answers which ioctl commands are available for the + * range, keep at the end as the last 8 bytes aren't read. + */ + __u64 ioctls; +}; + +#endif /* _LINUX_USERFAULTFD_H */ -- cgit v1.2.3 From 932b18e0aec65acb089f4bd8761ee85e70f8eb6a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:10 -0700 Subject: userfaultfd: linux/userfaultfd_k.h Kernel header defining the methods needed by the VM common code to interact with the userfaultfd. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/userfaultfd_k.h | 79 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 include/linux/userfaultfd_k.h diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h new file mode 100644 index 000000000000..e1e43609a179 --- /dev/null +++ b/include/linux/userfaultfd_k.h @@ -0,0 +1,79 @@ +/* + * include/linux/userfaultfd_k.h + * + * Copyright (C) 2015 Red Hat, Inc. + * + */ + +#ifndef _LINUX_USERFAULTFD_K_H +#define _LINUX_USERFAULTFD_K_H + +#ifdef CONFIG_USERFAULTFD + +#include /* linux/include/uapi/linux/userfaultfd.h */ + +#include + +/* + * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining + * new flags, since they might collide with O_* ones. We want + * to re-use O_* flags that couldn't possibly have a meaning + * from userfaultfd, in order to leave a free define-space for + * shared O_* flags. 
+ */ +#define UFFD_CLOEXEC O_CLOEXEC +#define UFFD_NONBLOCK O_NONBLOCK + +#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) +#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) + +extern int handle_userfault(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, unsigned long reason); + +/* mm helpers */ +static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx vm_ctx) +{ + return vma->vm_userfaultfd_ctx.ctx == vm_ctx.ctx; +} + +static inline bool userfaultfd_missing(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_UFFD_MISSING; +} + +static inline bool userfaultfd_armed(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP); +} + +#else /* CONFIG_USERFAULTFD */ + +/* mm helpers */ +static inline int handle_userfault(struct vm_area_struct *vma, + unsigned long address, + unsigned int flags, + unsigned long reason) +{ + return VM_FAULT_SIGBUS; +} + +static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, + struct vm_userfaultfd_ctx vm_ctx) +{ + return true; +} + +static inline bool userfaultfd_missing(struct vm_area_struct *vma) +{ + return false; +} + +static inline bool userfaultfd_armed(struct vm_area_struct *vma) +{ + return false; +} + +#endif /* CONFIG_USERFAULTFD */ + +#endif /* _LINUX_USERFAULTFD_K_H */ -- cgit v1.2.3 From 745f234be12b6191b15eae8dd415cc81a9137f47 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:14 -0700 Subject: userfaultfd: add vm_userfaultfd_ctx to the vm_area_struct This adds the vm_userfaultfd_ctx to the vm_area_struct. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 11 +++++++++++ kernel/fork.c | 1 + 2 files changed, 12 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 15549578d559..26a30c3566f0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -256,6 +256,16 @@ struct vm_region { * this region */ }; +#ifdef CONFIG_USERFAULTFD +#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, }) +struct vm_userfaultfd_ctx { + struct userfaultfd_ctx *ctx; +}; +#else /* CONFIG_USERFAULTFD */ +#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {}) +struct vm_userfaultfd_ctx {}; +#endif /* CONFIG_USERFAULTFD */ + /* * This struct defines a memory VMM memory area. There is one of these * per VM-area/task. 
A VM area is any part of the process virtual memory @@ -322,6 +332,7 @@ struct vm_area_struct { #ifdef CONFIG_NUMA struct mempolicy *vm_policy; /* NUMA policy for the VMA */ #endif + struct vm_userfaultfd_ctx vm_userfaultfd_ctx; }; struct core_thread { diff --git a/kernel/fork.c b/kernel/fork.c index 03aa2e6de7a4..ceb4eb4abb9d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -456,6 +456,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~VM_LOCKED; tmp->vm_next = tmp->vm_prev = NULL; + tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; if (file) { struct inode *inode = file_inode(file); -- cgit v1.2.3 From 16ba6f811dfe44bc14f7946a4b257b85476fc16e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:17 -0700 Subject: userfaultfd: add VM_UFFD_MISSING and VM_UFFD_WP These two flags gets set in vma->vm_flags to tell the VM common code if the userfaultfd is armed and in which mode (only tracking missing faults, only tracking wrprotect faults or both). If neither flags is set it means the userfaultfd is not armed on the vma. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 ++ include/linux/mm.h | 2 ++ kernel/fork.c | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ca1e091881d4..3b4d8255e806 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -597,6 +597,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_HUGEPAGE)] = "hg", [ilog2(VM_NOHUGEPAGE)] = "nh", [ilog2(VM_MERGEABLE)] = "mg", + [ilog2(VM_UFFD_MISSING)]= "um", + [ilog2(VM_UFFD_WP)] = "uw", }; size_t i; diff --git a/include/linux/mm.h b/include/linux/mm.h index bf6f117fcf4d..0f7cd30039ea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -124,8 +124,10 @@ extern unsigned int kobjsize(const void *objp); #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. 
*/ +#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ diff --git a/kernel/fork.c b/kernel/fork.c index ceb4eb4abb9d..7d5f0f118a63 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -454,7 +454,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~VM_LOCKED; + tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP); tmp->vm_next = tmp->vm_prev = NULL; tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; file = tmp->vm_file; -- cgit v1.2.3 From 6b251fc96cf2cdf1ce4b5db055547e2a5679bc77 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:20 -0700 Subject: userfaultfd: call handle_userfault() for userfaultfd_missing() faults This is where the page faults must be modified to call handle_userfault() if userfaultfd_missing() is true (so if the vma->vm_flags had VM_UFFD_MISSING set). handle_userfault() then takes care of blocking the page fault and delivering it to userland. The fault flags must also be passed as parameter so the "read|write" kind of fault can be passed to userland. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 69 ++++++++++++++++++++++++++++++++++++++------------------ mm/memory.c | 16 +++++++++++++ 2 files changed, 63 insertions(+), 22 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 097c7a4bfbd9..7735f99931fa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -717,7 +718,8 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, - struct page *page, gfp_t gfp) + struct page *page, gfp_t gfp, + unsigned int flags) { struct mem_cgroup *memcg; pgtable_t pgtable; @@ -725,12 +727,16 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) - return VM_FAULT_OOM; + if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) { + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) { mem_cgroup_cancel_charge(page, memcg); + put_page(page); return VM_FAULT_OOM; } @@ -750,6 +756,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pte_free(mm, pgtable); } else { pmd_t entry; + + /* Deliver the page fault to userland */ + if (userfaultfd_missing(vma)) { + int ret; + + spin_unlock(ptl); + mem_cgroup_cancel_charge(page, memcg); + put_page(page); + pte_free(mm, pgtable); + ret = handle_userfault(vma, haddr, flags, + VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + return ret; + } + entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr); @@ -760,6 +781,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 
atomic_long_inc(&mm->nr_ptes); spin_unlock(ptl); + count_vm_event(THP_FAULT_ALLOC); } return 0; @@ -771,19 +793,16 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) } /* Caller must hold page table lock. */ -static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, +static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) { pmd_t entry; - if (!pmd_none(*pmd)) - return false; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); atomic_long_inc(&mm->nr_ptes); - return true; } int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, @@ -806,6 +825,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, pgtable_t pgtable; struct page *zero_page; bool set; + int ret; pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; @@ -816,14 +836,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; } ptl = pmd_lock(mm, pmd); - set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, - zero_page); - spin_unlock(ptl); + ret = 0; + set = false; + if (pmd_none(*pmd)) { + if (userfaultfd_missing(vma)) { + spin_unlock(ptl); + ret = handle_userfault(vma, haddr, flags, + VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + } else { + set_huge_zero_page(pgtable, mm, vma, + haddr, pmd, + zero_page); + spin_unlock(ptl); + set = true; + } + } else + spin_unlock(ptl); if (!set) { pte_free(mm, pgtable); put_huge_zero_page(); } - return 0; + return ret; } gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); @@ -831,14 +865,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) { - put_page(page); - count_vm_event(THP_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; - } - - count_vm_event(THP_FAULT_ALLOC); - return 0; + return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp, flags); } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -873,16 +900,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ if (is_huge_zero_pmd(pmd)) { struct page *zero_page; - bool set; /* * get_huge_zero_page() will never allocate a new page here, * since we already have a zero page to copy. It just takes a * reference. 
*/ zero_page = get_huge_zero_page(); - set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, + set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, zero_page); - BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ ret = 0; goto out_unlock; } diff --git a/mm/memory.c b/mm/memory.c index 388dcf9aa283..2961fb654369 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include @@ -2685,6 +2686,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto unlock; + /* Deliver the page fault to userland, check inside PT lock */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(page_table, ptl); + return handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + } goto setpte; } @@ -2713,6 +2720,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; + /* Deliver the page fault to userland, check inside PT lock */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(page_table, ptl); + mem_cgroup_cancel_charge(page, memcg); + page_cache_release(page); + return handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + } + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); mem_cgroup_commit_charge(page, memcg, false); -- cgit v1.2.3 From 19a809afe2fe089317226bbe5c5a1ce7f53dcdca Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:24 -0700 Subject: userfaultfd: teach vma_merge to merge across vma->vm_userfaultfd_ctx vma->vm_userfaultfd_ctx is yet another vma parameter that vma_merge must be aware about so that we can merge vmas back like they were originally before arming the userfaultfd on some memory range. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. 
David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/madvise.c | 3 ++- mm/mempolicy.c | 4 ++-- mm/mlock.c | 3 ++- mm/mmap.c | 40 +++++++++++++++++++++++++++------------- mm/mprotect.c | 3 ++- 6 files changed, 36 insertions(+), 19 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0f7cd30039ea..77a9d609523e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1835,7 +1835,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start, extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *); + struct mempolicy *, struct vm_userfaultfd_ctx); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); diff --git a/mm/madvise.c b/mm/madvise.c index 64bb8a22110c..911357973905 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -103,7 +103,8 @@ static long madvise_behavior(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma)); + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*prev) { vma = *prev; goto success; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 99d4c1d0b858..a7f1e0d1d6b8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -722,8 +722,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, - new_pol); + vma->anon_vma, vma->vm_file, pgoff, + new_pol, vma->vm_userfaultfd_ctx); if (prev) { vma = prev; next = vma->vm_next; diff --git a/mm/mlock.c b/mm/mlock.c index 6fd2cf15e868..25936680064f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -510,7 +510,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma)); + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index f126923ce683..82db4fc0a9d3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -919,7 +920,8 @@ again: remove_next = 1 + (end > next->vm_end); * per-vma resources, so we don't attempt to merge those. 
*/ static inline int is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags) + struct file *file, unsigned long vm_flags, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -935,6 +937,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (vma->vm_ops && vma->vm_ops->close) return 0; + if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) + return 0; return 1; } @@ -965,9 +969,11 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, */ static int can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { - if (is_mergeable_vma(vma, file, vm_flags) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; @@ -984,9 +990,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, */ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { - if (is_mergeable_vma(vma, file, vm_flags) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); @@ -1029,7 +1037,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy) + pgoff_t pgoff, struct mempolicy *policy, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1056,14 +1065,17 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (prev && prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, - anon_vma, file, pgoff)) { + anon_vma, file, pgoff, + vm_userfaultfd_ctx)) { /* * OK, it can. Can we now merge in the successor as well? */ if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen) && + anon_vma, file, + pgoff+pglen, + vm_userfaultfd_ctx) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1084,7 +1096,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen)) { + anon_vma, file, pgoff+pglen, + vm_userfaultfd_ctx)) { if (prev && addr < prev->vm_end) /* case 4 */ err = vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL); @@ -1570,8 +1583,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* * Can we just expand an old mapping? 
*/ - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, - NULL); + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); if (vma) goto out; @@ -2757,7 +2770,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) /* Can we just expand an old private anonymous mapping? */ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); if (vma) goto out; @@ -2913,7 +2926,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index e7d6f1171ecb..ef5be8eaab00 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -292,7 +292,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*pprev) { vma = *pprev; goto success; -- cgit v1.2.3 From c1294d05de5df1ab8c93aa13c531782ede907e14 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:27 -0700 Subject: userfaultfd: prevent khugepaged to merge if userfaultfd is armed If userfaultfd is armed on a certain vma we can't "fill" the holes with zeroes or we'll break the userland on demand paging. The holes if the userfault is armed, are really missing information (not zeroes) that the userland has to load from network or elsewhere. The same issue happens for wrprotected ptes that we can't just convert into a single writable pmd_trans_huge. We could however in theory still merge across zeropages if only VM_UFFD_MISSING is set (so if VM_UFFD_WP is not set)... that could be slightly improved but it'd be much more complex code for a tiny corner case. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. 
David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7735f99931fa..d38aaf9dcba6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2158,7 +2158,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - if (++none_or_zero <= khugepaged_max_ptes_none) + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out; @@ -2611,7 +2612,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - if (++none_or_zero <= khugepaged_max_ptes_none) + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out_unmap; -- cgit v1.2.3 From 86039bd3b4e6a1129318cbfed4e0a6e001656635 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:31 -0700 Subject: userfaultfd: add new syscall to provide memory externalization Once an userfaultfd has been created and certain region of the process virtual address space have been registered into it, the thread responsible for doing the memory externalization can manage the page faults in userland by talking to the kernel using the userfaultfd protocol. poll() can be used to know when there are new pending userfaults to be read (POLLIN). Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 1036 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1036 insertions(+) create mode 100644 fs/userfaultfd.c diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c new file mode 100644 index 000000000000..9bc256d1a143 --- /dev/null +++ b/fs/userfaultfd.c @@ -0,0 +1,1036 @@ +/* + * fs/userfaultfd.c + * + * Copyright (C) 2007 Davide Libenzi + * Copyright (C) 2008-2009 Red Hat, Inc. + * Copyright (C) 2015 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Some part derived from fs/eventfd.c (anon inode setup) and + * mm/ksm.c (mm hashing). 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum userfaultfd_state { + UFFD_STATE_WAIT_API, + UFFD_STATE_RUNNING, +}; + +struct userfaultfd_ctx { + /* pseudo fd refcounting */ + atomic_t refcount; + /* waitqueue head for the userfaultfd page faults */ + wait_queue_head_t fault_wqh; + /* waitqueue head for the pseudo fd to wakeup poll/read */ + wait_queue_head_t fd_wqh; + /* userfaultfd syscall flags */ + unsigned int flags; + /* state machine */ + enum userfaultfd_state state; + /* released */ + bool released; + /* mm with one ore more vmas attached to this userfaultfd_ctx */ + struct mm_struct *mm; +}; + +struct userfaultfd_wait_queue { + unsigned long address; + wait_queue_t wq; + bool pending; + struct userfaultfd_ctx *ctx; +}; + +struct userfaultfd_wake_range { + unsigned long start; + unsigned long len; +}; + +static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, + int wake_flags, void *key) +{ + struct userfaultfd_wake_range *range = key; + int ret; + struct userfaultfd_wait_queue *uwq; + unsigned long start, len; + + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); + ret = 0; + /* don't wake the pending ones to avoid reads to block */ + if (uwq->pending && !ACCESS_ONCE(uwq->ctx->released)) + goto out; + /* len == 0 means wake all */ + start = range->start; + len = range->len; + if (len && (start > uwq->address || start + len <= uwq->address)) + goto out; + ret = wake_up_state(wq->private, mode); + if (ret) + /* + * Wake only once, autoremove behavior. + * + * After the effect of list_del_init is visible to the + * other CPUs, the waitqueue may disappear from under + * us, see the !list_empty_careful() in + * handle_userfault(). try_to_wake_up() has an + * implicit smp_mb__before_spinlock, and the + * wq->private is read before calling the extern + * function "wake_up_state" (which in turns calls + * try_to_wake_up). While the spin_lock;spin_unlock; + * wouldn't be enough, the smp_mb__before_spinlock is + * enough to avoid an explicit smp_mb() here. + */ + list_del_init(&wq->task_list); +out: + return ret; +} + +/** + * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd + * context. + * @ctx: [in] Pointer to the userfaultfd context. + * + * Returns: In case of success, returns not zero. + */ +static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx) +{ + if (!atomic_inc_not_zero(&ctx->refcount)) + BUG(); +} + +/** + * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd + * context. + * @ctx: [in] Pointer to userfaultfd context. + * + * The userfaultfd context reference must have been previously acquired either + * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget(). 
+ */ +static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) { + VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock)); + VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh)); + VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock)); + VM_BUG_ON(waitqueue_active(&ctx->fault_wqh)); + VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); + VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); + mmput(ctx->mm); + kfree(ctx); + } +} + +static inline unsigned long userfault_address(unsigned long address, + unsigned int flags, + unsigned long reason) +{ + BUILD_BUG_ON(PAGE_SHIFT < UFFD_BITS); + address &= PAGE_MASK; + if (flags & FAULT_FLAG_WRITE) + /* + * Encode "write" fault information in the LSB of the + * address read by userland, without depending on + * FAULT_FLAG_WRITE kernel internal value. + */ + address |= UFFD_BIT_WRITE; + if (reason & VM_UFFD_WP) + /* + * Encode "reason" fault information as bit number 1 + * in the address read by userland. If bit number 1 is + * clear it means the reason is a VM_FAULT_MISSING + * fault. + */ + address |= UFFD_BIT_WP; + return address; +} + +/* + * The locking rules involved in returning VM_FAULT_RETRY depending on + * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and + * FAULT_FLAG_KILLABLE are not straightforward. The "Caution" + * recommendation in __lock_page_or_retry is not an understatement. + * + * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_sem must be released + * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is + * not set. + * + * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not + * set, VM_FAULT_RETRY can still be returned if and only if there are + * fatal_signal_pending()s, and the mmap_sem must be released before + * returning it. + */ +int handle_userfault(struct vm_area_struct *vma, unsigned long address, + unsigned int flags, unsigned long reason) +{ + struct mm_struct *mm = vma->vm_mm; + struct userfaultfd_ctx *ctx; + struct userfaultfd_wait_queue uwq; + + BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + + ctx = vma->vm_userfaultfd_ctx.ctx; + if (!ctx) + return VM_FAULT_SIGBUS; + + BUG_ON(ctx->mm != mm); + + VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); + VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); + + /* + * If it's already released don't get it. This avoids to loop + * in __get_user_pages if userfaultfd_release waits on the + * caller of handle_userfault to release the mmap_sem. + */ + if (unlikely(ACCESS_ONCE(ctx->released))) + return VM_FAULT_SIGBUS; + + /* + * Check that we can return VM_FAULT_RETRY. + * + * NOTE: it should become possible to return VM_FAULT_RETRY + * even if FAULT_FLAG_TRIED is set without leading to gup() + * -EBUSY failures, if the userfaultfd is to be extended for + * VM_UFFD_WP tracking and we intend to arm the userfault + * without first stopping userland access to the memory. For + * VM_UFFD_MISSING userfaults this is enough for now. + */ + if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) { + /* + * Validate the invariant that nowait must allow retry + * to be sure not to return SIGBUS erroneously on + * nowait invocations. + */ + BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT); +#ifdef CONFIG_DEBUG_VM + if (printk_ratelimit()) { + printk(KERN_WARNING + "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags); + dump_stack(); + } +#endif + return VM_FAULT_SIGBUS; + } + + /* + * Handle nowait, not much to do other than tell it to retry + * and wait. 
+ */ + if (flags & FAULT_FLAG_RETRY_NOWAIT) + return VM_FAULT_RETRY; + + /* take the reference before dropping the mmap_sem */ + userfaultfd_ctx_get(ctx); + + /* be gentle and immediately relinquish the mmap_sem */ + up_read(&mm->mmap_sem); + + init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); + uwq.wq.private = current; + uwq.address = userfault_address(address, flags, reason); + uwq.pending = true; + uwq.ctx = ctx; + + spin_lock(&ctx->fault_wqh.lock); + /* + * After the __add_wait_queue the uwq is visible to userland + * through poll/read(). + */ + __add_wait_queue(&ctx->fault_wqh, &uwq.wq); + for (;;) { + set_current_state(TASK_KILLABLE); + if (!uwq.pending || ACCESS_ONCE(ctx->released) || + fatal_signal_pending(current)) + break; + spin_unlock(&ctx->fault_wqh.lock); + + wake_up_poll(&ctx->fd_wqh, POLLIN); + schedule(); + + spin_lock(&ctx->fault_wqh.lock); + } + __remove_wait_queue(&ctx->fault_wqh, &uwq.wq); + __set_current_state(TASK_RUNNING); + spin_unlock(&ctx->fault_wqh.lock); + + /* + * ctx may go away after this if the userfault pseudo fd is + * already released. + */ + userfaultfd_ctx_put(ctx); + + return VM_FAULT_RETRY; +} + +static int userfaultfd_release(struct inode *inode, struct file *file) +{ + struct userfaultfd_ctx *ctx = file->private_data; + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *prev; + /* len == 0 means wake all */ + struct userfaultfd_wake_range range = { .len = 0, }; + unsigned long new_flags; + + ACCESS_ONCE(ctx->released) = true; + + /* + * Flush page faults out of all CPUs. NOTE: all page faults + * must be retried without returning VM_FAULT_SIGBUS if + * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx + * changes while handle_userfault released the mmap_sem. So + * it's critical that released is set to true (above), before + * taking the mmap_sem for writing. + */ + down_write(&mm->mmap_sem); + prev = NULL; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + cond_resched(); + BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ + !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + if (vma->vm_userfaultfd_ctx.ctx != ctx) { + prev = vma; + continue; + } + new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); + prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end, + new_flags, vma->anon_vma, + vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX); + if (prev) + vma = prev; + else + prev = vma; + vma->vm_flags = new_flags; + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + } + up_write(&mm->mmap_sem); + + /* + * After no new page faults can wait on this fault_wqh, flush + * the last page faults that may have been already waiting on + * the fault_wqh. + */ + spin_lock(&ctx->fault_wqh.lock); + __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range); + spin_unlock(&ctx->fault_wqh.lock); + + wake_up_poll(&ctx->fd_wqh, POLLHUP); + userfaultfd_ctx_put(ctx); + return 0; +} + +/* fault_wqh.lock must be hold by the caller */ +static inline unsigned int find_userfault(struct userfaultfd_ctx *ctx, + struct userfaultfd_wait_queue **uwq) +{ + wait_queue_t *wq; + struct userfaultfd_wait_queue *_uwq; + unsigned int ret = 0; + + VM_BUG_ON(!spin_is_locked(&ctx->fault_wqh.lock)); + + list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { + _uwq = container_of(wq, struct userfaultfd_wait_queue, wq); + if (_uwq->pending) { + ret = POLLIN; + if (!uwq) + /* + * If there's at least a pending and + * we don't care which one it is, + * break immediately and leverage the + * efficiency of the LIFO walk. 
+ */ + break; + /* + * If we need to find which one was pending we + * keep walking until we find the first not + * pending one, so we read() them in FIFO order. + */ + *uwq = _uwq; + } else + /* + * break the loop at the first not pending + * one, there cannot be pending userfaults + * after the first not pending one, because + * all new pending ones are inserted at the + * head and we walk it in LIFO. + */ + break; + } + + return ret; +} + +static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) +{ + struct userfaultfd_ctx *ctx = file->private_data; + unsigned int ret; + + poll_wait(file, &ctx->fd_wqh, wait); + + switch (ctx->state) { + case UFFD_STATE_WAIT_API: + return POLLERR; + case UFFD_STATE_RUNNING: + spin_lock(&ctx->fault_wqh.lock); + ret = find_userfault(ctx, NULL); + spin_unlock(&ctx->fault_wqh.lock); + return ret; + default: + BUG(); + } +} + +static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, + __u64 *addr) +{ + ssize_t ret; + DECLARE_WAITQUEUE(wait, current); + struct userfaultfd_wait_queue *uwq = NULL; + + /* always take the fd_wqh lock before the fault_wqh lock */ + spin_lock(&ctx->fd_wqh.lock); + __add_wait_queue(&ctx->fd_wqh, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock(&ctx->fault_wqh.lock); + if (find_userfault(ctx, &uwq)) { + /* + * The fault_wqh.lock prevents the uwq to + * disappear from under us. + */ + uwq->pending = false; + /* careful to always initialize addr if ret == 0 */ + *addr = uwq->address; + spin_unlock(&ctx->fault_wqh.lock); + ret = 0; + break; + } + spin_unlock(&ctx->fault_wqh.lock); + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + if (no_wait) { + ret = -EAGAIN; + break; + } + spin_unlock(&ctx->fd_wqh.lock); + schedule(); + spin_lock(&ctx->fd_wqh.lock); + } + __remove_wait_queue(&ctx->fd_wqh, &wait); + __set_current_state(TASK_RUNNING); + spin_unlock(&ctx->fd_wqh.lock); + + return ret; +} + +static ssize_t userfaultfd_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct userfaultfd_ctx *ctx = file->private_data; + ssize_t _ret, ret = 0; + /* careful to always initialize addr if ret == 0 */ + __u64 uninitialized_var(addr); + int no_wait = file->f_flags & O_NONBLOCK; + + if (ctx->state == UFFD_STATE_WAIT_API) + return -EINVAL; + BUG_ON(ctx->state != UFFD_STATE_RUNNING); + + for (;;) { + if (count < sizeof(addr)) + return ret ? ret : -EINVAL; + _ret = userfaultfd_ctx_read(ctx, no_wait, &addr); + if (_ret < 0) + return ret ? ret : _ret; + if (put_user(addr, (__u64 __user *) buf)) + return ret ? ret : -EFAULT; + ret += sizeof(addr); + buf += sizeof(addr); + count -= sizeof(addr); + /* + * Allow to read more than one fault at time but only + * block if waiting for the very first one. + */ + no_wait = O_NONBLOCK; + } +} + +static void __wake_userfault(struct userfaultfd_ctx *ctx, + struct userfaultfd_wake_range *range) +{ + unsigned long start, end; + + start = range->start; + end = range->start + range->len; + + spin_lock(&ctx->fault_wqh.lock); + /* wake all in the range and autoremove */ + __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range); + spin_unlock(&ctx->fault_wqh.lock); +} + +static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, + struct userfaultfd_wake_range *range) +{ + /* + * To be sure waitqueue_active() is not reordered by the CPU + * before the pagetable update, use an explicit SMP memory + * barrier here. 
PT lock release or up_read(mmap_sem) still + * have release semantics that can allow the + * waitqueue_active() to be reordered before the pte update. + */ + smp_mb(); + + /* + * Use waitqueue_active because it's very frequent to + * change the address space atomically even if there are no + * userfaults yet. So we take the spinlock only when we're + * sure we've userfaults to wake. + */ + if (waitqueue_active(&ctx->fault_wqh)) + __wake_userfault(ctx, range); +} + +static __always_inline int validate_range(struct mm_struct *mm, + __u64 start, __u64 len) +{ + __u64 task_size = mm->task_size; + + if (start & ~PAGE_MASK) + return -EINVAL; + if (len & ~PAGE_MASK) + return -EINVAL; + if (!len) + return -EINVAL; + if (start < mmap_min_addr) + return -EINVAL; + if (start >= task_size) + return -EINVAL; + if (len > task_size - start) + return -EINVAL; + return 0; +} + +static int userfaultfd_register(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *prev, *cur; + int ret; + struct uffdio_register uffdio_register; + struct uffdio_register __user *user_uffdio_register; + unsigned long vm_flags, new_flags; + bool found; + unsigned long start, end, vma_end; + + user_uffdio_register = (struct uffdio_register __user *) arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_register, user_uffdio_register, + sizeof(uffdio_register)-sizeof(__u64))) + goto out; + + ret = -EINVAL; + if (!uffdio_register.mode) + goto out; + if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING| + UFFDIO_REGISTER_MODE_WP)) + goto out; + vm_flags = 0; + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) + vm_flags |= VM_UFFD_MISSING; + if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { + vm_flags |= VM_UFFD_WP; + /* + * FIXME: remove the below error constraint by + * implementing the wprotect tracking mode. + */ + ret = -EINVAL; + goto out; + } + + ret = validate_range(mm, uffdio_register.range.start, + uffdio_register.range.len); + if (ret) + goto out; + + start = uffdio_register.range.start; + end = start + uffdio_register.range.len; + + down_write(&mm->mmap_sem); + vma = find_vma_prev(mm, start, &prev); + + ret = -ENOMEM; + if (!vma) + goto out_unlock; + + /* check that there's at least one vma in the range */ + ret = -EINVAL; + if (vma->vm_start >= end) + goto out_unlock; + + /* + * Search for not compatible vmas. + * + * FIXME: this shall be relaxed later so that it doesn't fail + * on tmpfs backed vmas (in addition to the current allowance + * on anonymous vmas). + */ + found = false; + for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + cond_resched(); + + BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + + /* check not compatible vmas */ + ret = -EINVAL; + if (cur->vm_ops) + goto out_unlock; + + /* + * Check that this vma isn't already owned by a + * different userfaultfd. We can't allow more than one + * userfaultfd to own a single vma simultaneously or we + * wouldn't know which one to deliver the userfaults to. + */ + ret = -EBUSY; + if (cur->vm_userfaultfd_ctx.ctx && + cur->vm_userfaultfd_ctx.ctx != ctx) + goto out_unlock; + + found = true; + } + BUG_ON(!found); + + if (vma->vm_start < start) + prev = vma; + + ret = 0; + do { + cond_resched(); + + BUG_ON(vma->vm_ops); + BUG_ON(vma->vm_userfaultfd_ctx.ctx && + vma->vm_userfaultfd_ctx.ctx != ctx); + + /* + * Nothing to do: this vma is already registered into this + * userfaultfd and with the right tracking mode too. 
+ */ + if (vma->vm_userfaultfd_ctx.ctx == ctx && + (vma->vm_flags & vm_flags) == vm_flags) + goto skip; + + if (vma->vm_start > start) + start = vma->vm_start; + vma_end = min(end, vma->vm_end); + + new_flags = (vma->vm_flags & ~vm_flags) | vm_flags; + prev = vma_merge(mm, prev, start, vma_end, new_flags, + vma->anon_vma, vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + ((struct vm_userfaultfd_ctx){ ctx })); + if (prev) { + vma = prev; + goto next; + } + if (vma->vm_start < start) { + ret = split_vma(mm, vma, start, 1); + if (ret) + break; + } + if (vma->vm_end > end) { + ret = split_vma(mm, vma, end, 0); + if (ret) + break; + } + next: + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + vma->vm_flags = new_flags; + vma->vm_userfaultfd_ctx.ctx = ctx; + + skip: + prev = vma; + start = vma->vm_end; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); +out_unlock: + up_write(&mm->mmap_sem); + if (!ret) { + /* + * Now that we scanned all vmas we can already tell + * userland which ioctls methods are guaranteed to + * succeed on this range. + */ + if (put_user(UFFD_API_RANGE_IOCTLS, + &user_uffdio_register->ioctls)) + ret = -EFAULT; + } +out: + return ret; +} + +static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct mm_struct *mm = ctx->mm; + struct vm_area_struct *vma, *prev, *cur; + int ret; + struct uffdio_range uffdio_unregister; + unsigned long new_flags; + bool found; + unsigned long start, end, vma_end; + const void __user *buf = (void __user *)arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) + goto out; + + ret = validate_range(mm, uffdio_unregister.start, + uffdio_unregister.len); + if (ret) + goto out; + + start = uffdio_unregister.start; + end = start + uffdio_unregister.len; + + down_write(&mm->mmap_sem); + vma = find_vma_prev(mm, start, &prev); + + ret = -ENOMEM; + if (!vma) + goto out_unlock; + + /* check that there's at least one vma in the range */ + ret = -EINVAL; + if (vma->vm_start >= end) + goto out_unlock; + + /* + * Search for not compatible vmas. + * + * FIXME: this shall be relaxed later so that it doesn't fail + * on tmpfs backed vmas (in addition to the current allowance + * on anonymous vmas). + */ + found = false; + ret = -EINVAL; + for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + cond_resched(); + + BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ + !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); + + /* + * Check not compatible vmas, not strictly required + * here as not compatible vmas cannot have an + * userfaultfd_ctx registered on them, but this + * provides for more strict behavior to notice + * unregistration errors. + */ + if (cur->vm_ops) + goto out_unlock; + + found = true; + } + BUG_ON(!found); + + if (vma->vm_start < start) + prev = vma; + + ret = 0; + do { + cond_resched(); + + BUG_ON(vma->vm_ops); + + /* + * Nothing to do: this vma is already registered into this + * userfaultfd and with the right tracking mode too. 
+ */ + if (!vma->vm_userfaultfd_ctx.ctx) + goto skip; + + if (vma->vm_start > start) + start = vma->vm_start; + vma_end = min(end, vma->vm_end); + + new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP); + prev = vma_merge(mm, prev, start, vma_end, new_flags, + vma->anon_vma, vma->vm_file, vma->vm_pgoff, + vma_policy(vma), + NULL_VM_UFFD_CTX); + if (prev) { + vma = prev; + goto next; + } + if (vma->vm_start < start) { + ret = split_vma(mm, vma, start, 1); + if (ret) + break; + } + if (vma->vm_end > end) { + ret = split_vma(mm, vma, end, 0); + if (ret) + break; + } + next: + /* + * In the vma_merge() successful mprotect-like case 8: + * the next vma was merged into the current one and + * the current one has not been updated yet. + */ + vma->vm_flags = new_flags; + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + + skip: + prev = vma; + start = vma->vm_end; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); +out_unlock: + up_write(&mm->mmap_sem); +out: + return ret; +} + +/* + * This is mostly needed to re-wakeup those userfaults that were still + * pending when userland wake them up the first time. We don't wake + * the pending one to avoid blocking reads to block, or non blocking + * read to return -EAGAIN, if used with POLLIN, to avoid userland + * doubts on why POLLIN wasn't reliable. + */ +static int userfaultfd_wake(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + int ret; + struct uffdio_range uffdio_wake; + struct userfaultfd_wake_range range; + const void __user *buf = (void __user *)arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) + goto out; + + ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); + if (ret) + goto out; + + range.start = uffdio_wake.start; + range.len = uffdio_wake.len; + + /* + * len == 0 means wake all and we don't want to wake all here, + * so check it again to be sure. + */ + VM_BUG_ON(!range.len); + + wake_userfault(ctx, &range); + ret = 0; + +out: + return ret; +} + +/* + * userland asks for a certain API version and we return which bits + * and ioctl commands are implemented in this kernel for such API + * version or -EINVAL if unknown. 
+ */ +static int userfaultfd_api(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + struct uffdio_api uffdio_api; + void __user *buf = (void __user *)arg; + int ret; + + ret = -EINVAL; + if (ctx->state != UFFD_STATE_WAIT_API) + goto out; + ret = -EFAULT; + if (copy_from_user(&uffdio_api, buf, sizeof(__u64))) + goto out; + if (uffdio_api.api != UFFD_API) { + /* careful not to leak info, we only read the first 8 bytes */ + memset(&uffdio_api, 0, sizeof(uffdio_api)); + if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) + goto out; + ret = -EINVAL; + goto out; + } + /* careful not to leak info, we only read the first 8 bytes */ + uffdio_api.bits = UFFD_API_BITS; + uffdio_api.ioctls = UFFD_API_IOCTLS; + ret = -EFAULT; + if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) + goto out; + ctx->state = UFFD_STATE_RUNNING; + ret = 0; +out: + return ret; +} + +static long userfaultfd_ioctl(struct file *file, unsigned cmd, + unsigned long arg) +{ + int ret = -EINVAL; + struct userfaultfd_ctx *ctx = file->private_data; + + switch(cmd) { + case UFFDIO_API: + ret = userfaultfd_api(ctx, arg); + break; + case UFFDIO_REGISTER: + ret = userfaultfd_register(ctx, arg); + break; + case UFFDIO_UNREGISTER: + ret = userfaultfd_unregister(ctx, arg); + break; + case UFFDIO_WAKE: + ret = userfaultfd_wake(ctx, arg); + break; + } + return ret; +} + +#ifdef CONFIG_PROC_FS +static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct userfaultfd_ctx *ctx = f->private_data; + wait_queue_t *wq; + struct userfaultfd_wait_queue *uwq; + unsigned long pending = 0, total = 0; + + spin_lock(&ctx->fault_wqh.lock); + list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); + if (uwq->pending) + pending++; + total++; + } + spin_unlock(&ctx->fault_wqh.lock); + + /* + * If more protocols will be added, there will be all shown + * separated by a space. Like this: + * protocols: aa:... bb:... + */ + seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n", + pending, total, UFFD_API, UFFD_API_BITS, + UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS); +} +#endif + +static const struct file_operations userfaultfd_fops = { +#ifdef CONFIG_PROC_FS + .show_fdinfo = userfaultfd_show_fdinfo, +#endif + .release = userfaultfd_release, + .poll = userfaultfd_poll, + .read = userfaultfd_read, + .unlocked_ioctl = userfaultfd_ioctl, + .compat_ioctl = userfaultfd_ioctl, + .llseek = noop_llseek, +}; + +/** + * userfaultfd_file_create - Creates an userfaultfd file pointer. + * @flags: Flags for the userfaultfd file. + * + * This function creates an userfaultfd file pointer, w/out installing + * it into the fd table. This is useful when the userfaultfd file is + * used during the initialization of data structures that require + * extra setup after the userfaultfd creation. So the userfaultfd + * creation is split into the file pointer creation phase, and the + * file descriptor installation phase. In this way races with + * userspace closing the newly installed file descriptor can be + * avoided. Returns an userfaultfd file pointer, or a proper error + * pointer. + */ +static struct file *userfaultfd_file_create(int flags) +{ + struct file *file; + struct userfaultfd_ctx *ctx; + + BUG_ON(!current->mm); + + /* Check the UFFD_* constants for consistency. 
*/ + BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC); + BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); + + file = ERR_PTR(-EINVAL); + if (flags & ~UFFD_SHARED_FCNTL_FLAGS) + goto out; + + file = ERR_PTR(-ENOMEM); + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + goto out; + + atomic_set(&ctx->refcount, 1); + init_waitqueue_head(&ctx->fault_wqh); + init_waitqueue_head(&ctx->fd_wqh); + ctx->flags = flags; + ctx->state = UFFD_STATE_WAIT_API; + ctx->released = false; + ctx->mm = current->mm; + /* prevent the mm struct to be freed */ + atomic_inc(&ctx->mm->mm_users); + + file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); + if (IS_ERR(file)) + kfree(ctx); +out: + return file; +} + +SYSCALL_DEFINE1(userfaultfd, int, flags) +{ + int fd, error; + struct file *file; + + error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS); + if (error < 0) + return error; + fd = error; + + file = userfaultfd_file_create(flags); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_put_unused_fd; + } + fd_install(fd, file); + + return fd; + +err_put_unused_fd: + put_unused_fd(fd); + + return error; +} -- cgit v1.2.3 From 3f602d2724b1f7d2d27ddcd7963a040a5890fd16 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 4 Sep 2015 15:46:34 -0700 Subject: userfaultfd: Rename uffd_api.bits into .features This is (seems to be) the minimal thing that is required to unblock standard uffd usage from the non-cooperative one. Now more bits can be added to the features field indicating e.g. UFFD_FEATURE_FORK and others needed for the latter use-case. Signed-off-by: Pavel Emelyanov Signed-off-by: Andrea Arcangeli Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 4 ++-- include/uapi/linux/userfaultfd.h | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 9bc256d1a143..0756d97b0666 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -884,7 +884,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, goto out; } /* careful not to leak info, we only read the first 8 bytes */ - uffdio_api.bits = UFFD_API_BITS; + uffdio_api.features = UFFD_API_FEATURES; uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) @@ -941,7 +941,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) * protocols: aa:... bb:... 
 */
	seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
-		   pending, total, UFFD_API, UFFD_API_BITS,
+		   pending, total, UFFD_API, UFFD_API_FEATURES,
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
 }
 #endif
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 09c2e2a8c9d6..330206016249 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -12,8 +12,8 @@
 #include

 #define UFFD_API ((__u64)0xAA)
-/* FIXME: add "|UFFD_BIT_WP" to UFFD_API_BITS after implementing it */
-#define UFFD_API_BITS (UFFD_BIT_WRITE)
+/* FIXME: add "|UFFD_FEATURE_WP" to UFFD_API_FEATURES after implementing it */
+#define UFFD_API_FEATURES (UFFD_FEATURE_WRITE_BIT)
 #define UFFD_API_IOCTLS \
	((__u64)1 << _UFFDIO_REGISTER | \
	 (__u64)1 << _UFFDIO_UNREGISTER | \
@@ -53,12 +53,18 @@
 #define UFFD_BIT_WP (1<<1)	/* handle_userfault() reason VM_UFFD_WP */
 #define UFFD_BITS 2	/* two above bits used for UFFD_BIT_* mask */

+/*
+ * Features reported in uffdio_api.features field
+ */
+#define UFFD_FEATURE_WRITE_BIT (1<<0)	/* Corresponds to UFFD_BIT_WRITE */
+#define UFFD_FEATURE_WP_BIT (1<<1)	/* Corresponds to UFFD_BIT_WP */
+
 struct uffdio_api {
	/* userland asks for an API number */
	__u64 api;

	/* kernel answers below with the available features for the API */
-	__u64 bits;
+	__u64 features;
	__u64 ioctls;
 };
-- cgit v1.2.3

From a9b85f9415fd9e529d03299e5335433f614ec1fb Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli
Date: Fri, 4 Sep 2015 15:46:37 -0700
Subject: userfaultfd: change the read API to return a uffd_msg

I had requests to return the full address (not the page aligned one) to
userland.

It's not entirely clear how the page offset could be relevant, because
userfaults aren't like SIGBUS, which can sigjump to a different place and
actually skip resolving the fault depending on a page offset.  There's
currently no real way to skip the fault, especially because after a
UFFDIO_COPY|ZEROPAGE the fault is optimized to be retried within the
kernel without having to return to userland first (not even self-modifying
code replacing the .text that touched the faulting address would prevent
the fault from being repeated).  Userland cannot skip repeating the fault
even more so if the fault was triggered by a KVM secondary page fault or
any get_user_pages or any copy-user inside some syscall which will return
to kernel code.  The second time FAULT_FLAG_RETRY_NOWAIT won't be set,
leading to a SIGBUS being raised, because the userfault can't wait if it
cannot release the mmap_sem first (and FAULT_FLAG_RETRY_NOWAIT is required
for that).

Still, returning a proper structure to userland during the read() on the
uffd allows the current UFFD_API to be used for the future non-cooperative
extensions too, and it looks cleaner as well.  Once we get additional
fields, there's no point in returning the fault address page aligned
anymore just to reuse the bits below PAGE_SHIFT.

The only downside is that the read() syscall will read 32 bytes instead of
8 bytes, but that's not going to be measurable overhead.

The total number of new events that can be extended, or of new future bits
for already shipped events, is limited to 64 by the features field of the
uffdio_api structure.  If more are needed, a bump of UFFD_API will be
required.

[akpm@linux-foundation.org: use __packed]
Signed-off-by: Andrea Arcangeli
Acked-by: Pavel Emelyanov
Cc: Sanidhya Kashyap
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A.
Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/userfaultfd.txt | 12 +++--- fs/userfaultfd.c | 79 +++++++++++++++++++++++----------------- include/uapi/linux/userfaultfd.h | 70 +++++++++++++++++++++++++++-------- 3 files changed, 108 insertions(+), 53 deletions(-) diff --git a/Documentation/vm/userfaultfd.txt b/Documentation/vm/userfaultfd.txt index 90912925425e..70a3c94d1941 100644 --- a/Documentation/vm/userfaultfd.txt +++ b/Documentation/vm/userfaultfd.txt @@ -46,11 +46,13 @@ is a corner case that would currently return -EBUSY). When first opened the userfaultfd must be enabled invoking the UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or a later API version) which will specify the read/POLLIN protocol -userland intends to speak on the UFFD. The UFFDIO_API ioctl if -successful (i.e. if the requested uffdio_api.api is spoken also by the -running kernel), will return into uffdio_api.features and -uffdio_api.ioctls two 64bit bitmasks of respectively the activated -feature of the read(2) protocol and the generic ioctl available. +userland intends to speak on the UFFD and the uffdio_api.features +userland requires. The UFFDIO_API ioctl if successful (i.e. if the +requested uffdio_api.api is spoken also by the running kernel and the +requested features are going to be enabled) will return into +uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of +respectively all the available features of the read(2) protocol and +the generic ioctl available. Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should be invoked (if present in the returned uffdio_api.ioctls bitmask) to diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 0756d97b0666..1f2ddaaf3c03 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -50,7 +50,7 @@ struct userfaultfd_ctx { }; struct userfaultfd_wait_queue { - unsigned long address; + struct uffd_msg msg; wait_queue_t wq; bool pending; struct userfaultfd_ctx *ctx; @@ -77,7 +77,8 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, /* len == 0 means wake all */ start = range->start; len = range->len; - if (len && (start > uwq->address || start + len <= uwq->address)) + if (len && (start > uwq->msg.arg.pagefault.address || + start + len <= uwq->msg.arg.pagefault.address)) goto out; ret = wake_up_state(wq->private, mode); if (ret) @@ -135,28 +136,43 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) } } -static inline unsigned long userfault_address(unsigned long address, - unsigned int flags, - unsigned long reason) +static inline void msg_init(struct uffd_msg *msg) { - BUILD_BUG_ON(PAGE_SHIFT < UFFD_BITS); - address &= PAGE_MASK; + BUILD_BUG_ON(sizeof(struct uffd_msg) != 32); + /* + * Must use memset to zero out the paddings or kernel data is + * leaked to userland. + */ + memset(msg, 0, sizeof(struct uffd_msg)); +} + +static inline struct uffd_msg userfault_msg(unsigned long address, + unsigned int flags, + unsigned long reason) +{ + struct uffd_msg msg; + msg_init(&msg); + msg.event = UFFD_EVENT_PAGEFAULT; + msg.arg.pagefault.address = address; if (flags & FAULT_FLAG_WRITE) /* - * Encode "write" fault information in the LSB of the - * address read by userland, without depending on - * FAULT_FLAG_WRITE kernel internal value. 
+ * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the + * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE + * was not set in a UFFD_EVENT_PAGEFAULT, it means it + * was a read fault, otherwise if set it means it's + * a write fault. */ - address |= UFFD_BIT_WRITE; + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE; if (reason & VM_UFFD_WP) /* - * Encode "reason" fault information as bit number 1 - * in the address read by userland. If bit number 1 is - * clear it means the reason is a VM_FAULT_MISSING - * fault. + * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the + * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was + * not set in a UFFD_EVENT_PAGEFAULT, it means it was + * a missing fault, otherwise if set it means it's a + * write protect fault. */ - address |= UFFD_BIT_WP; - return address; + msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; + return msg; } /* @@ -242,7 +258,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.address = userfault_address(address, flags, reason); + uwq.msg = userfault_msg(address, flags, reason); uwq.pending = true; uwq.ctx = ctx; @@ -398,7 +414,7 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) } static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, - __u64 *addr) + struct uffd_msg *msg) { ssize_t ret; DECLARE_WAITQUEUE(wait, current); @@ -416,8 +432,8 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, * disappear from under us. */ uwq->pending = false; - /* careful to always initialize addr if ret == 0 */ - *addr = uwq->address; + /* careful to always initialize msg if ret == 0 */ + *msg = uwq->msg; spin_unlock(&ctx->fault_wqh.lock); ret = 0; break; @@ -447,8 +463,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, { struct userfaultfd_ctx *ctx = file->private_data; ssize_t _ret, ret = 0; - /* careful to always initialize addr if ret == 0 */ - __u64 uninitialized_var(addr); + struct uffd_msg msg; int no_wait = file->f_flags & O_NONBLOCK; if (ctx->state == UFFD_STATE_WAIT_API) @@ -456,16 +471,16 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, BUG_ON(ctx->state != UFFD_STATE_RUNNING); for (;;) { - if (count < sizeof(addr)) + if (count < sizeof(msg)) return ret ? ret : -EINVAL; - _ret = userfaultfd_ctx_read(ctx, no_wait, &addr); + _ret = userfaultfd_ctx_read(ctx, no_wait, &msg); if (_ret < 0) return ret ? ret : _ret; - if (put_user(addr, (__u64 __user *) buf)) + if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg))) return ret ? ret : -EFAULT; - ret += sizeof(addr); - buf += sizeof(addr); - count -= sizeof(addr); + ret += sizeof(msg); + buf += sizeof(msg); + count -= sizeof(msg); /* * Allow to read more than one fault at time but only * block if waiting for the very first one. 
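For illustration only (not part of the patch): a minimal userland sketch of
consuming the new fixed-size records, assuming an userfaultfd that has already
been created and registered. Only the uapi names introduced in this series are
real; handle_missing_page() is a hypothetical application callback.

#include <unistd.h>
#include <linux/userfaultfd.h>

/* hypothetical application resolver, e.g. backed by UFFDIO_COPY later on */
extern void handle_missing_page(__u64 address, int is_write_fault);

static void drain_one_event(int uffd)
{
	struct uffd_msg msg;

	/* each record is exactly sizeof(struct uffd_msg) == 32 bytes */
	if (read(uffd, &msg, sizeof(msg)) != (ssize_t)sizeof(msg))
		return;	/* -1/EAGAIN is possible when opened O_NONBLOCK */
	if (msg.event != UFFD_EVENT_PAGEFAULT)
		return;	/* only pagefault events exist in this API so far */
	handle_missing_page(msg.arg.pagefault.address,
			    !!(msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE));
}
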
@@ -873,17 +888,15 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, if (ctx->state != UFFD_STATE_WAIT_API) goto out; ret = -EFAULT; - if (copy_from_user(&uffdio_api, buf, sizeof(__u64))) + if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; - if (uffdio_api.api != UFFD_API) { - /* careful not to leak info, we only read the first 8 bytes */ + if (uffdio_api.api != UFFD_API || uffdio_api.features) { memset(&uffdio_api, 0, sizeof(uffdio_api)); if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; ret = -EINVAL; goto out; } - /* careful not to leak info, we only read the first 8 bytes */ uffdio_api.features = UFFD_API_FEATURES; uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 330206016249..a5f8825381ef 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -11,9 +11,15 @@ #include +#include + #define UFFD_API ((__u64)0xAA) -/* FIXME: add "|UFFD_FEATURE_WP" to UFFD_API_FEATURES after implementing it */ -#define UFFD_API_FEATURES (UFFD_FEATURE_WRITE_BIT) +/* + * After implementing the respective features it will become: + * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ + * UFFD_FEATURE_EVENT_FORK) + */ +#define UFFD_API_FEATURES (0) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -45,26 +51,60 @@ #define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ struct uffdio_range) -/* - * Valid bits below PAGE_SHIFT in the userfault address read through - * the read() syscall. - */ -#define UFFD_BIT_WRITE (1<<0) /* this was a write fault, MISSING or WP */ -#define UFFD_BIT_WP (1<<1) /* handle_userfault() reason VM_UFFD_WP */ -#define UFFD_BITS 2 /* two above bits used for UFFD_BIT_* mask */ +/* read() structure */ +struct uffd_msg { + __u8 event; + + __u8 reserved1; + __u16 reserved2; + __u32 reserved3; + + union { + struct { + __u64 flags; + __u64 address; + } pagefault; + + struct { + /* unused reserved fields */ + __u64 reserved1; + __u64 reserved2; + __u64 reserved3; + } reserved; + } arg; +} __packed; /* - * Features reported in uffdio_api.features field + * Start at 0x12 and not at 0 to be more strict against bugs. */ -#define UFFD_FEATURE_WRITE_BIT (1<<0) /* Corresponds to UFFD_BIT_WRITE */ -#define UFFD_FEATURE_WP_BIT (1<<1) /* Corresponds to UFFD_BIT_WP */ +#define UFFD_EVENT_PAGEFAULT 0x12 +#if 0 /* not available yet */ +#define UFFD_EVENT_FORK 0x13 +#endif + +/* flags for UFFD_EVENT_PAGEFAULT */ +#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ +#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ struct uffdio_api { - /* userland asks for an API number */ + /* userland asks for an API number and the features to enable */ __u64 api; - - /* kernel answers below with the available features for the API */ + /* + * Kernel answers below with the all available features for + * the API, this notifies userland of which events and/or + * which flags for each event are enabled in the current + * kernel. + * + * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE + * are to be considered implicitly always enabled in all kernels as + * long as the uffdio_api.api requested matches UFFD_API. 
+ */
+#if 0 /* not available yet */
+#define UFFD_FEATURE_PAGEFAULT_FLAG_WP	(1<<0)
+#define UFFD_FEATURE_EVENT_FORK	(1<<1)
+#endif
	__u64 features;
+
	__u64 ioctls;
 };
-- cgit v1.2.3

From ba85c702e4b247393ffe9e3fbc13d8aee7b02059 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli
Date: Fri, 4 Sep 2015 15:46:41 -0700
Subject: userfaultfd: wake pending userfaults

This is an optimization, but it's a userland visible one and it affects
the API.

The downside of this optimization is that if you call poll() and you get
POLLIN, read(ufd) may still return -EAGAIN.  The blocked userfault may be
woken by a different thread, before read(ufd) comes around.  This in short
means that poll() isn't really usable if the userfaultfd is opened in
blocking mode.

userfaults won't wait in "pending" state to be read anymore and any
UFFDIO_WAKE or similar operations that have the objective of waking
userfaults after their resolution will wake all blocked userfaults for the
resolved range, including those that haven't been read() by userland yet.

The behavior of poll() becomes non-standard, but this obviates the need
for "spurious" UFFDIO_WAKE and it lets the userland threads restart
immediately without requiring an UFFDIO_WAKE.  This is even more
significant in case of repeated faults on the same address from multiple
threads.

This optimization is justified by the measurement that the number of
spurious UFFDIO_WAKE accounts for 5% to 10% of the total userfaults for
heavy workloads, so it's worth optimizing those away.

Signed-off-by: Andrea Arcangeli
Acked-by: Pavel Emelyanov
Cc: Sanidhya Kashyap
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov"
Cc: Andres Lagar-Cavilla
Cc: Dave Hansen
Cc: Paolo Bonzini
Cc: Rik van Riel
Cc: Mel Gorman
Cc: Andy Lutomirski
Cc: Hugh Dickins
Cc: Peter Feiner
Cc: "Dr. David Alan Gilbert"
Cc: Johannes Weiner
Cc: "Huangpeng (Peter)"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/userfaultfd.c | 65 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 22 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 1f2ddaaf3c03..0877222dfa47 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -52,6 +52,10 @@ struct userfaultfd_ctx {
 struct userfaultfd_wait_queue {
	struct uffd_msg msg;
	wait_queue_t wq;
+	/*
+	 * Only relevant when queued in fault_wqh and only used by the
+	 * read operation to avoid reading the same userfault twice.
+	 */
	bool pending;
	struct userfaultfd_ctx *ctx;
 };
@@ -71,9 +75,6 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
	ret = 0;
-	/* don't wake the pending ones to avoid reads to block */
-	if (uwq->pending && !ACCESS_ONCE(uwq->ctx->released))
-		goto out;
	/* len == 0 means wake all */
	start = range->start;
	len = range->len;
@@ -196,12 +197,14 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
	struct mm_struct *mm = vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue uwq;
+	int ret;

	BUG_ON(!rwsem_is_locked(&mm->mmap_sem));

+	ret = VM_FAULT_SIGBUS;
	ctx = vma->vm_userfaultfd_ctx.ctx;
	if (!ctx)
-		return VM_FAULT_SIGBUS;
+		goto out;

	BUG_ON(ctx->mm != mm);

@@ -214,7 +217,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
	 * caller of handle_userfault to release the mmap_sem.
	 */
	if (unlikely(ACCESS_ONCE(ctx->released)))
-		return VM_FAULT_SIGBUS;
+		goto out;

	/*
	 * Check that we can return VM_FAULT_RETRY.
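The practical upshot for applications is sketched below, under the assumption
that the uffd was created with UFFD_NONBLOCK: after this change POLLIN no
longer guarantees that a record is still queued, so the read() side has to
treat EAGAIN as a spurious wakeup rather than an error. process_msg() is a
hypothetical callback; everything else uses only uapi names from this series.

#include <errno.h>
#include <poll.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

extern void process_msg(const struct uffd_msg *msg); /* hypothetical */

/* Sketch of the poll()+read() loop this commit implies; the uffd is
 * assumed to have been opened non-blocking (UFFD_NONBLOCK). */
static int uffd_event_loop(int uffd)
{
	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
	struct uffd_msg msg;

	for (;;) {
		if (poll(&pfd, 1, -1) < 0)
			return -1;
		/* drain everything currently readable */
		while (read(uffd, &msg, sizeof(msg)) == (ssize_t)sizeof(msg))
			process_msg(&msg);
		/* a userfault woken before being read shows up as EAGAIN */
		if (errno != EAGAIN)
			return -1;
	}
}
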
@@ -240,15 +243,16 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, dump_stack(); } #endif - return VM_FAULT_SIGBUS; + goto out; } /* * Handle nowait, not much to do other than tell it to retry * and wait. */ + ret = VM_FAULT_RETRY; if (flags & FAULT_FLAG_RETRY_NOWAIT) - return VM_FAULT_RETRY; + goto out; /* take the reference before dropping the mmap_sem */ userfaultfd_ctx_get(ctx); @@ -268,21 +272,23 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, * through poll/read(). */ __add_wait_queue(&ctx->fault_wqh, &uwq.wq); - for (;;) { - set_current_state(TASK_KILLABLE); - if (!uwq.pending || ACCESS_ONCE(ctx->released) || - fatal_signal_pending(current)) - break; - spin_unlock(&ctx->fault_wqh.lock); + set_current_state(TASK_KILLABLE); + spin_unlock(&ctx->fault_wqh.lock); + if (likely(!ACCESS_ONCE(ctx->released) && + !fatal_signal_pending(current))) { wake_up_poll(&ctx->fd_wqh, POLLIN); schedule(); + ret |= VM_FAULT_MAJOR; + } + __set_current_state(TASK_RUNNING); + /* see finish_wait() comment for why list_empty_careful() */ + if (!list_empty_careful(&uwq.wq.task_list)) { spin_lock(&ctx->fault_wqh.lock); + list_del_init(&uwq.wq.task_list); + spin_unlock(&ctx->fault_wqh.lock); } - __remove_wait_queue(&ctx->fault_wqh, &uwq.wq); - __set_current_state(TASK_RUNNING); - spin_unlock(&ctx->fault_wqh.lock); /* * ctx may go away after this if the userfault pseudo fd is @@ -290,7 +296,8 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, */ userfaultfd_ctx_put(ctx); - return VM_FAULT_RETRY; +out: + return ret; } static int userfaultfd_release(struct inode *inode, struct file *file) @@ -404,6 +411,12 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) case UFFD_STATE_WAIT_API: return POLLERR; case UFFD_STATE_RUNNING: + /* + * poll() never guarantees that read won't block. + * userfaults can be waken before they're read(). + */ + if (unlikely(!(file->f_flags & O_NONBLOCK))) + return POLLERR; spin_lock(&ctx->fault_wqh.lock); ret = find_userfault(ctx, NULL); spin_unlock(&ctx->fault_wqh.lock); @@ -834,11 +847,19 @@ out: } /* - * This is mostly needed to re-wakeup those userfaults that were still - * pending when userland wake them up the first time. We don't wake - * the pending one to avoid blocking reads to block, or non blocking - * read to return -EAGAIN, if used with POLLIN, to avoid userland - * doubts on why POLLIN wasn't reliable. + * userfaultfd_wake is needed in case an userfault is in flight by the + * time a UFFDIO_COPY (or other ioctl variants) completes. The page + * may be well get mapped and the page fault if repeated wouldn't lead + * to a userfault anymore, but before scheduling in TASK_KILLABLE mode + * handle_userfault() doesn't recheck the pagetables and it doesn't + * serialize against UFFDO_COPY (or other ioctl variants). Ultimately + * the knowledge of which pages are mapped is left to userland who is + * responsible for handling the race between read() userfaults and + * background UFFDIO_COPY (or other ioctl variants), if done by + * separate concurrent threads. + * + * userfaultfd_wake may be used in combination with the + * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches. 
 */
static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
-- cgit v1.2.3

From 15b726ef048b31a24b3fefb6863083a25fe34800 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli
Date: Fri, 4 Sep 2015 15:46:44 -0700
Subject: userfaultfd: optimize read() and poll() to be O(1)

This makes read() O(1), and poll(), which was already O(1), becomes
lockless.

Signed-off-by: Andrea Arcangeli
Acked-by: Pavel Emelyanov
Cc: Sanidhya Kashyap
Cc: zhang.zhanghailiang@huawei.com
Cc: "Kirill A. Shutemov"
Cc: Andres Lagar-Cavilla
Cc: Dave Hansen
Cc: Paolo Bonzini
Cc: Rik van Riel
Cc: Mel Gorman
Cc: Andy Lutomirski
Cc: Hugh Dickins
Cc: Peter Feiner
Cc: "Dr. David Alan Gilbert"
Cc: Johannes Weiner
Cc: "Huangpeng (Peter)"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/userfaultfd.c | 185 +++++++++++++++++++++++++++++++++----------------------
 1 file changed, 111 insertions(+), 74 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 0877222dfa47..232cbf37c59f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -35,7 +35,9 @@ enum userfaultfd_state {
 struct userfaultfd_ctx {
	/* pseudo fd refcounting */
	atomic_t refcount;
-	/* waitqueue head for the userfaultfd page faults */
+	/* waitqueue head for the pending (i.e. not read) userfaults */
+	wait_queue_head_t fault_pending_wqh;
+	/* waitqueue head for the userfaults */
	wait_queue_head_t fault_wqh;
	/* waitqueue head for the pseudo fd to wakeup poll/read */
	wait_queue_head_t fd_wqh;
@@ -52,11 +54,6 @@ struct userfaultfd_ctx {
 struct userfaultfd_wait_queue {
	struct uffd_msg msg;
	wait_queue_t wq;
-	/*
-	 * Only relevant when queued in fault_wqh and only used by the
-	 * read operation to avoid reading the same userfault twice.
-	 */
-	bool pending;
	struct userfaultfd_ctx *ctx;
 };
@@ -263,17 +260,21 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
	uwq.wq.private = current;
	uwq.msg = userfault_msg(address, flags, reason);
-	uwq.pending = true;
	uwq.ctx = ctx;

-	spin_lock(&ctx->fault_wqh.lock);
+	spin_lock(&ctx->fault_pending_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
-	__add_wait_queue(&ctx->fault_wqh, &uwq.wq);
+	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
+	/*
+	 * The smp_mb() after __set_current_state prevents the reads
+	 * following the spin_unlock to happen before the list_add in
+	 * __add_wait_queue.
+	 */
	set_current_state(TASK_KILLABLE);
-	spin_unlock(&ctx->fault_wqh.lock);
+	spin_unlock(&ctx->fault_pending_wqh.lock);

	if (likely(!ACCESS_ONCE(ctx->released) &&
		   !fatal_signal_pending(current))) {
@@ -283,11 +284,28 @@
	}

	__set_current_state(TASK_RUNNING);
-
-	/* see finish_wait() comment for why list_empty_careful() */
+
+	/*
+	 * Here we race with the list_del; list_add in
+	 * userfaultfd_ctx_read(), however because we don't ever run
+	 * list_del_init() to refile across the two lists, the prev
+	 * and next pointers will never point to self. list_add also
+	 * would never let any of the two pointers to point to
+	 * self. So list_empty_careful won't risk to see both pointers
+	 * pointing to self at any time during the list refile. The
+	 * only case where list_del_init() is called is the full
+	 * removal in the wake function and there we don't re-list_add
+	 * and it's fine not to block on the spinlock. The uwq on this
+	 * kernel stack can be released after the list_del_init.
+ */ if (!list_empty_careful(&uwq.wq.task_list)) { - spin_lock(&ctx->fault_wqh.lock); - list_del_init(&uwq.wq.task_list); - spin_unlock(&ctx->fault_wqh.lock); + spin_lock(&ctx->fault_pending_wqh.lock); + /* + * No need of list_del_init(), the uwq on the stack + * will be freed shortly anyway. + */ + list_del(&uwq.wq.task_list); + spin_unlock(&ctx->fault_pending_wqh.lock); } /* @@ -345,59 +363,38 @@ static int userfaultfd_release(struct inode *inode, struct file *file) up_write(&mm->mmap_sem); /* - * After no new page faults can wait on this fault_wqh, flush + * After no new page faults can wait on this fault_*wqh, flush * the last page faults that may have been already waiting on - * the fault_wqh. + * the fault_*wqh. */ - spin_lock(&ctx->fault_wqh.lock); + spin_lock(&ctx->fault_pending_wqh.lock); + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range); __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range); - spin_unlock(&ctx->fault_wqh.lock); + spin_unlock(&ctx->fault_pending_wqh.lock); wake_up_poll(&ctx->fd_wqh, POLLHUP); userfaultfd_ctx_put(ctx); return 0; } -/* fault_wqh.lock must be hold by the caller */ -static inline unsigned int find_userfault(struct userfaultfd_ctx *ctx, - struct userfaultfd_wait_queue **uwq) +/* fault_pending_wqh.lock must be hold by the caller */ +static inline struct userfaultfd_wait_queue *find_userfault( + struct userfaultfd_ctx *ctx) { wait_queue_t *wq; - struct userfaultfd_wait_queue *_uwq; - unsigned int ret = 0; - - VM_BUG_ON(!spin_is_locked(&ctx->fault_wqh.lock)); + struct userfaultfd_wait_queue *uwq; - list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { - _uwq = container_of(wq, struct userfaultfd_wait_queue, wq); - if (_uwq->pending) { - ret = POLLIN; - if (!uwq) - /* - * If there's at least a pending and - * we don't care which one it is, - * break immediately and leverage the - * efficiency of the LIFO walk. - */ - break; - /* - * If we need to find which one was pending we - * keep walking until we find the first not - * pending one, so we read() them in FIFO order. - */ - *uwq = _uwq; - } else - /* - * break the loop at the first not pending - * one, there cannot be pending userfaults - * after the first not pending one, because - * all new pending ones are inserted at the - * head and we walk it in LIFO. - */ - break; - } + VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock)); - return ret; + uwq = NULL; + if (!waitqueue_active(&ctx->fault_pending_wqh)) + goto out; + /* walk in reverse to provide FIFO behavior to read userfaults */ + wq = list_last_entry(&ctx->fault_pending_wqh.task_list, + typeof(*wq), task_list); + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); +out: + return uwq; } static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) @@ -417,9 +414,20 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait) */ if (unlikely(!(file->f_flags & O_NONBLOCK))) return POLLERR; - spin_lock(&ctx->fault_wqh.lock); - ret = find_userfault(ctx, NULL); - spin_unlock(&ctx->fault_wqh.lock); + /* + * lockless access to see if there are pending faults + * __pollwait last action is the add_wait_queue but + * the spin_unlock would allow the waitqueue_active to + * pass above the actual list_add inside + * add_wait_queue critical section. So use a full + * memory barrier to serialize the list_add write of + * add_wait_queue() with the waitqueue_active read + * below. 
+ */ + ret = 0; + smp_mb(); + if (waitqueue_active(&ctx->fault_pending_wqh)) + ret = POLLIN; return ret; default: BUG(); @@ -431,27 +439,47 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, { ssize_t ret; DECLARE_WAITQUEUE(wait, current); - struct userfaultfd_wait_queue *uwq = NULL; + struct userfaultfd_wait_queue *uwq; - /* always take the fd_wqh lock before the fault_wqh lock */ + /* always take the fd_wqh lock before the fault_pending_wqh lock */ spin_lock(&ctx->fd_wqh.lock); __add_wait_queue(&ctx->fd_wqh, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); - spin_lock(&ctx->fault_wqh.lock); - if (find_userfault(ctx, &uwq)) { + spin_lock(&ctx->fault_pending_wqh.lock); + uwq = find_userfault(ctx); + if (uwq) { /* - * The fault_wqh.lock prevents the uwq to - * disappear from under us. + * The fault_pending_wqh.lock prevents the uwq + * to disappear from under us. + * + * Refile this userfault from + * fault_pending_wqh to fault_wqh, it's not + * pending anymore after we read it. + * + * Use list_del() by hand (as + * userfaultfd_wake_function also uses + * list_del_init() by hand) to be sure nobody + * changes __remove_wait_queue() to use + * list_del_init() in turn breaking the + * !list_empty_careful() check in + * handle_userfault(). The uwq->wq.task_list + * must never be empty at any time during the + * refile, or the waitqueue could disappear + * from under us. The "wait_queue_head_t" + * parameter of __remove_wait_queue() is unused + * anyway. */ - uwq->pending = false; + list_del(&uwq->wq.task_list); + __add_wait_queue(&ctx->fault_wqh, &uwq->wq); + /* careful to always initialize msg if ret == 0 */ *msg = uwq->msg; - spin_unlock(&ctx->fault_wqh.lock); + spin_unlock(&ctx->fault_pending_wqh.lock); ret = 0; break; } - spin_unlock(&ctx->fault_wqh.lock); + spin_unlock(&ctx->fault_pending_wqh.lock); if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -510,10 +538,14 @@ static void __wake_userfault(struct userfaultfd_ctx *ctx, start = range->start; end = range->start + range->len; - spin_lock(&ctx->fault_wqh.lock); + spin_lock(&ctx->fault_pending_wqh.lock); /* wake all in the range and autoremove */ - __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range); - spin_unlock(&ctx->fault_wqh.lock); + if (waitqueue_active(&ctx->fault_pending_wqh)) + __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, + range); + if (waitqueue_active(&ctx->fault_wqh)) + __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range); + spin_unlock(&ctx->fault_pending_wqh.lock); } static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, @@ -534,7 +566,8 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, * userfaults yet. So we take the spinlock only when we're * sure we've userfaults to wake. 
*/ - if (waitqueue_active(&ctx->fault_wqh)) + if (waitqueue_active(&ctx->fault_pending_wqh) || + waitqueue_active(&ctx->fault_wqh)) __wake_userfault(ctx, range); } @@ -960,14 +993,17 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f) struct userfaultfd_wait_queue *uwq; unsigned long pending = 0, total = 0; - spin_lock(&ctx->fault_wqh.lock); + spin_lock(&ctx->fault_pending_wqh.lock); + list_for_each_entry(wq, &ctx->fault_pending_wqh.task_list, task_list) { + uwq = container_of(wq, struct userfaultfd_wait_queue, wq); + pending++; + total++; + } list_for_each_entry(wq, &ctx->fault_wqh.task_list, task_list) { uwq = container_of(wq, struct userfaultfd_wait_queue, wq); - if (uwq->pending) - pending++; total++; } - spin_unlock(&ctx->fault_wqh.lock); + spin_unlock(&ctx->fault_pending_wqh.lock); /* * If more protocols will be added, there will be all shown @@ -1027,6 +1063,7 @@ static struct file *userfaultfd_file_create(int flags) goto out; atomic_set(&ctx->refcount, 1); + init_waitqueue_head(&ctx->fault_pending_wqh); init_waitqueue_head(&ctx->fault_wqh); init_waitqueue_head(&ctx->fd_wqh); ctx->flags = flags; -- cgit v1.2.3 From 3004ec9cabf49f43fae2b2bd1855a4720f1def7a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:48 -0700 Subject: userfaultfd: allocate the userfaultfd_ctx cacheline aligned Use proper slab to guarantee alignment. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 232cbf37c59f..8977a4e8a7f8 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -27,20 +27,26 @@ #include #include +static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; + enum userfaultfd_state { UFFD_STATE_WAIT_API, UFFD_STATE_RUNNING, }; +/* + * Start with fault_pending_wqh and fault_wqh so they're more likely + * to be in the same cacheline. + */ struct userfaultfd_ctx { - /* pseudo fd refcounting */ - atomic_t refcount; /* waitqueue head for the pending (i.e. not read) userfaults */ wait_queue_head_t fault_pending_wqh; /* waitqueue head for the userfaults */ wait_queue_head_t fault_wqh; /* waitqueue head for the pseudo fd to wakeup poll/read */ wait_queue_head_t fd_wqh; + /* pseudo fd refcounting */ + atomic_t refcount; /* userfaultfd syscall flags */ unsigned int flags; /* state machine */ @@ -130,7 +136,7 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx) VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock)); VM_BUG_ON(waitqueue_active(&ctx->fd_wqh)); mmput(ctx->mm); - kfree(ctx); + kmem_cache_free(userfaultfd_ctx_cachep, ctx); } } @@ -1028,6 +1034,15 @@ static const struct file_operations userfaultfd_fops = { .llseek = noop_llseek, }; +static void init_once_userfaultfd_ctx(void *mem) +{ + struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem; + + init_waitqueue_head(&ctx->fault_pending_wqh); + init_waitqueue_head(&ctx->fault_wqh); + init_waitqueue_head(&ctx->fd_wqh); +} + /** * userfaultfd_file_create - Creates an userfaultfd file pointer. * @flags: Flags for the userfaultfd file. 
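A side note on the slab constructor idiom used by init_once_userfaultfd_ctx()
above: the ctor passed to kmem_cache_create() runs when a slab page is
populated with objects, not on every kmem_cache_alloc(), so only state that
stays valid across free/alloc cycles (here the waitqueue heads, whose lists
are empty again by the time the object is freed) may be initialized there;
per-use fields still have to be set after each allocation. A minimal
standalone sketch of the pattern, with hypothetical names:

#include <linux/slab.h>
#include <linux/wait.h>

/* hypothetical object illustrating the slab ctor idiom */
struct demo_obj {
	wait_queue_head_t wqh;	/* invariant across alloc/free: ctor-initialized */
	int state;		/* per-use: must be set after every allocation */
};

static struct kmem_cache *demo_cachep;

static void demo_init_once(void *mem)
{
	struct demo_obj *obj = mem;

	/* runs once per object when its backing slab page is created */
	init_waitqueue_head(&obj->wqh);
}

static int __init demo_cache_init(void)
{
	demo_cachep = kmem_cache_create("demo_cache", sizeof(struct demo_obj),
					0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
					demo_init_once);
	return 0;
}
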
@@ -1058,14 +1073,11 @@ static struct file *userfaultfd_file_create(int flags)
		goto out;

	file = ERR_PTR(-ENOMEM);
-	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
	if (!ctx)
		goto out;

	atomic_set(&ctx->refcount, 1);
-	init_waitqueue_head(&ctx->fault_pending_wqh);
-	init_waitqueue_head(&ctx->fault_wqh);
-	init_waitqueue_head(&ctx->fd_wqh);
	ctx->flags = flags;
	ctx->state = UFFD_STATE_WAIT_API;
	ctx->released = false;
@@ -1076,7 +1088,7 @@
	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
				  O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
	if (IS_ERR(file))
-		kfree(ctx);
+		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
out:
	return file;
 }
@@ -1105,3 +1117,14 @@ err_put_unused_fd:

	return error;
 }
+
+static int __init userfaultfd_init(void)
+{
+	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
+						sizeof(struct userfaultfd_ctx),
+						0,
+						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+						init_once_userfaultfd_ctx);
+	return 0;
+}
+__initcall(userfaultfd_init);
-- cgit v1.2.3

From 8d2afd96c20316d112e04d935d9e09150e988397 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli
Date: Fri, 4 Sep 2015 15:46:51 -0700
Subject: userfaultfd: solve the race between UFFDIO_COPY|ZEROPAGE and read

Solve in-kernel the race between UFFDIO_COPY|ZEROPAGE and
userfaultfd_read if they are run on different threads simultaneously.

Until now qemu solved the race in userland: the race was explicitly and
intentionally left for userland to solve.  However we can also solve it
in kernel.

Requiring all users to solve this race if they use two threads (one for
the background transfer and one for the userfault reads) isn't very
attractive from an API perspective; furthermore, solving it in kernel
allows removing a whole bunch of mutex and bitmap code from qemu, making
it faster.  The cost of __get_user_pages_fast should be insignificant
considering it scales perfectly and the pagetables are already hot in the
CPU cache, compared to the overhead in userland to maintain those
structures.

Applying this patch is backwards compatible with respect to the
userfaultfd userland API, however reverting this change wouldn't be
backwards compatible anymore.

Without this patch qemu, in the background transfer thread, has to read
the old state and do UFFDIO_WAKE if old_state is missing but becomes
REQUESTED by the time it tries to set it to RECEIVED (signaling the
other side received an userfault).
vcpu background_thr userfault_thr ----- ----- ----- vcpu0 handle_mm_fault() postcopy_place_page read old_state -> MISSING UFFDIO_COPY 0x7fb76a139000 (no wakeup, still pending) vcpu0 fault at 0x7fb76a139000 enters handle_userfault poll() is kicked poll() -> POLLIN read() -> 0x7fb76a139000 postcopy_pmi_change_state(MISSING, REQUESTED) -> REQUESTED tmp_state = postcopy_pmi_change_state(old_state, RECEIVED) -> REQUESTED /* check that no userfault raced with UFFDIO_COPY */ if (old_state == MISSING && tmp_state == REQUESTED) UFFDIO_WAKE from background thread And a second case where a UFFDIO_WAKE would be needed is in the userfault thread: vcpu background_thr userfault_thr ----- ----- ----- vcpu0 handle_mm_fault() postcopy_place_page read old_state -> MISSING UFFDIO_COPY 0x7fb76a139000 (no wakeup, still pending) tmp_state = postcopy_pmi_change_state(old_state, RECEIVED) -> RECEIVED vcpu0 fault at 0x7fb76a139000 enters handle_userfault poll() is kicked poll() -> POLLIN read() -> 0x7fb76a139000 if (postcopy_pmi_change_state(MISSING, REQUESTED) == RECEIVED) UFFDIO_WAKE from userfault thread This patch removes the need of both UFFDIO_WAKE and of the associated per-page tristate as well. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 81 +++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 15 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 8977a4e8a7f8..febbd2b165df 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -179,6 +179,67 @@ static inline struct uffd_msg userfault_msg(unsigned long address, return msg; } +/* + * Verify the pagetables are still not ok after having reigstered into + * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any + * userfault that has already been resolved, if userfaultfd_read and + * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different + * threads. + */ +static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, + unsigned long address, + unsigned long flags, + unsigned long reason) +{ + struct mm_struct *mm = ctx->mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + bool ret = true; + + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + pmd = pmd_offset(pud, address); + /* + * READ_ONCE must function as a barrier with narrower scope + * and it must be equivalent to: + * _pmd = *pmd; barrier(); + * + * This is to deal with the instability (as in + * pmd_trans_unstable) of the pmd. + */ + _pmd = READ_ONCE(*pmd); + if (!pmd_present(_pmd)) + goto out; + + ret = false; + if (pmd_trans_huge(_pmd)) + goto out; + + /* + * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it + * and use the standard pte_offset_map() instead of parsing _pmd. + */ + pte = pte_offset_map(pmd, address); + /* + * Lockless access: we're in a wait_event so it's ok if it + * changes under us. 
+ */ + if (pte_none(*pte)) + ret = true; + pte_unmap(pte); + +out: + return ret; +} + /* * The locking rules involved in returning VM_FAULT_RETRY depending on * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and @@ -201,6 +262,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; int ret; + bool must_wait; BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); @@ -260,9 +322,6 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, /* take the reference before dropping the mmap_sem */ userfaultfd_ctx_get(ctx); - /* be gentle and immediately relinquish the mmap_sem */ - up_read(&mm->mmap_sem); - init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; uwq.msg = userfault_msg(address, flags, reason); @@ -282,7 +341,10 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, set_current_state(TASK_KILLABLE); spin_unlock(&ctx->fault_pending_wqh.lock); - if (likely(!ACCESS_ONCE(ctx->released) && + must_wait = userfaultfd_must_wait(ctx, address, flags, reason); + up_read(&mm->mmap_sem); + + if (likely(must_wait && !ACCESS_ONCE(ctx->released) && !fatal_signal_pending(current))) { wake_up_poll(&ctx->fd_wqh, POLLIN); schedule(); @@ -886,17 +948,6 @@ out: } /* - * userfaultfd_wake is needed in case an userfault is in flight by the - * time a UFFDIO_COPY (or other ioctl variants) completes. The page - * may be well get mapped and the page fault if repeated wouldn't lead - * to a userfault anymore, but before scheduling in TASK_KILLABLE mode - * handle_userfault() doesn't recheck the pagetables and it doesn't - * serialize against UFFDO_COPY (or other ioctl variants). Ultimately - * the knowledge of which pages are mapped is left to userland who is - * responsible for handling the race between read() userfaults and - * background UFFDIO_COPY (or other ioctl variants), if done by - * separate concurrent threads. - * * userfaultfd_wake may be used in combination with the * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches. */ -- cgit v1.2.3 From a14c151e567cb2c3e62611da808a8bdab86fdee5 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:54 -0700 Subject: userfaultfd: buildsystem activation This allows to select the userfaultfd during configuration to build it. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Makefile | 1 + init/Kconfig | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/fs/Makefile b/fs/Makefile index 09e051fefc5b..f79cf4043e60 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o +obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FILE_LOCKING) += locks.o diff --git a/init/Kconfig b/init/Kconfig index bb9b4dd55889..161acd8bc56f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1576,6 +1576,17 @@ config ADVISE_SYSCALLS applications use these syscalls, you can disable this option to save space. 
+config USERFAULTFD + bool "Enable userfaultfd() system call" + select ANON_INODES + default y + depends on MMU + help + Enable the userfaultfd() system call that allows to intercept and + handle page faults in userland. + + If unsure, say Y. + config PCI_QUIRKS default y bool "Enable PCI quirk workarounds" if EXPERT -- cgit v1.2.3 From 1380fca084743fef8d17e59b273473393944ce58 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:58 -0700 Subject: userfaultfd: activate syscall This activates the userfaultfd syscall. [sfr@canb.auug.org.au: activate syscall fix] [akpm@linux-foundation.org: don't enable userfaultfd on powerpc] Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 1 + kernel/sys_ni.c | 1 + 4 files changed, 4 insertions(+) diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 25e3cf1cd8fd..477bfa6db370 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -380,3 +380,4 @@ 371 i386 recvfrom sys_recvfrom compat_sys_recvfrom 372 i386 recvmsg sys_recvmsg compat_sys_recvmsg 373 i386 shutdown sys_shutdown +374 i386 userfaultfd sys_userfaultfd diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 9ef32d5f1b19..81c490634db9 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -329,6 +329,7 @@ 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf 322 64 execveat stub_execveat +323 common userfaultfd sys_userfaultfd # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index b45c45b8c829..08001317aee7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -810,6 +810,7 @@ asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_eventfd2(unsigned int count, int flags); asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); +asmlinkage long sys_userfaultfd(int flags); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ca7d84f438f1..03c3875d9958 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -219,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); cond_syscall(sys_memfd_create); +cond_syscall(sys_userfaultfd); /* performance counters: */ cond_syscall(sys_perf_event_open); -- cgit v1.2.3 From 1f1c6f075904c241f9e44eb37efa8777141fc938 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:47:01 -0700 Subject: userfaultfd: UFFDIO_COPY|UFFDIO_ZEROPAGE uAPI This implements the uABI of UFFDIO_COPY and UFFDIO_ZEROPAGE. 
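Before the header changes below, a hedged sketch of how userland is expected
to drive UFFDIO_COPY once this uABI lands (not part of the patch; uffd and
page_size are assumed to be set up by the caller, and only the uffdio_copy
fields defined below are real):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/* Resolve one missing page at "fault_addr" with the contents of "src". */
static int resolve_with_copy(int uffd, unsigned long fault_addr,
			     const void *src, unsigned long page_size)
{
	struct uffdio_copy copy;

	memset(&copy, 0, sizeof(copy));
	copy.dst = fault_addr & ~(page_size - 1);	/* must be page aligned */
	copy.src = (unsigned long)src;
	copy.len = page_size;
	copy.mode = 0;	/* or UFFDIO_COPY_MODE_DONTWAKE to batch wakeups */

	if (ioctl(uffd, UFFDIO_COPY, &copy) == -1)
		return -1;
	/* the kernel writes back how much was actually copied in copy.copy */
	return copy.copy == (__s64)page_size ? 0 : -1;
}

UFFDIO_ZEROPAGE is driven the same way through struct uffdio_zeropage, with
the byte count reported back in its zeropage field.
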
Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/userfaultfd.h | 42 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index a5f8825381ef..df0e09bb7dd5 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -25,7 +25,9 @@ (__u64)1 << _UFFDIO_UNREGISTER | \ (__u64)1 << _UFFDIO_API) #define UFFD_API_RANGE_IOCTLS \ - ((__u64)1 << _UFFDIO_WAKE) + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_ZEROPAGE) /* * Valid ioctl command number range with this API is from 0x00 to @@ -38,6 +40,8 @@ #define _UFFDIO_REGISTER (0x00) #define _UFFDIO_UNREGISTER (0x01) #define _UFFDIO_WAKE (0x02) +#define _UFFDIO_COPY (0x03) +#define _UFFDIO_ZEROPAGE (0x04) #define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ @@ -50,6 +54,10 @@ struct uffdio_range) #define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ struct uffdio_range) +#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \ + struct uffdio_copy) +#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ + struct uffdio_zeropage) /* read() structure */ struct uffd_msg { @@ -126,4 +134,36 @@ struct uffdio_register { __u64 ioctls; }; +struct uffdio_copy { + __u64 dst; + __u64 src; + __u64 len; + /* + * There will be a wrprotection flag later that allows to map + * pages wrprotected on the fly. And such a flag will be + * available if the wrprotection ioctl are implemented for the + * range according to the uffdio_register.ioctls. + */ +#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * "copy" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 copy; +}; + +struct uffdio_zeropage { + struct uffdio_range range; +#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * "zeropage" is written by the ioctl and must be at the end: + * the copy_from_user will not read the last 8 bytes. + */ + __s64 zeropage; +}; + #endif /* _LINUX_USERFAULTFD_H */ -- cgit v1.2.3 From c1a4de99fada21e2e9251e52cbb51eff5aadc757 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:47:04 -0700 Subject: userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation This implements mcopy_atomic and mfill_zeropage that are the lowlevel VM methods that are invoked respectively by the UFFDIO_COPY and UFFDIO_ZEROPAGE userfaultfd commands. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. 
David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/userfaultfd_k.h | 6 + mm/Makefile | 1 + mm/userfaultfd.c | 269 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 276 insertions(+) create mode 100644 mm/userfaultfd.c diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e1e43609a179..587480ad41b7 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -30,6 +30,12 @@ extern int handle_userfault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long reason); +extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long src_start, unsigned long len); +extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long len); + /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) diff --git a/mm/Makefile b/mm/Makefile index 98c4eaeabdcb..b424d5e5b6ff 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -78,3 +78,4 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o +obj-$(CONFIG_USERFAULTFD) += userfaultfd.o diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c new file mode 100644 index 000000000000..c54c761609fc --- /dev/null +++ b/mm/userfaultfd.c @@ -0,0 +1,269 @@ +/* + * mm/userfaultfd.c + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static int mcopy_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr) +{ + struct mem_cgroup *memcg; + pte_t _dst_pte, *dst_pte; + spinlock_t *ptl; + struct page *page; + void *page_kaddr; + int ret; + + ret = -ENOMEM; + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr); + if (!page) + goto out; + + page_kaddr = kmap(page); + ret = -EFAULT; + if (copy_from_user(page_kaddr, (const void __user *) src_addr, + PAGE_SIZE)) + goto out_kunmap_release; + kunmap(page); + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceeding stores to the page contents become visible before + * the set_pte_at() write. 
+ */ + __SetPageUptodate(page); + + ret = -ENOMEM; + if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg)) + goto out_release; + + _dst_pte = mk_pte(page, dst_vma->vm_page_prot); + if (dst_vma->vm_flags & VM_WRITE) + _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); + + ret = -EEXIST; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!pte_none(*dst_pte)) + goto out_release_uncharge_unlock; + + inc_mm_counter(dst_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, dst_vma, dst_addr); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, dst_vma); + + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + + pte_unmap_unlock(dst_pte, ptl); + ret = 0; +out: + return ret; +out_release_uncharge_unlock: + pte_unmap_unlock(dst_pte, ptl); + mem_cgroup_cancel_charge(page, memcg); +out_release: + page_cache_release(page); + goto out; +out_kunmap_release: + kunmap(page); + goto out_release; +} + +static int mfill_zeropage_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + pte_t _dst_pte, *dst_pte; + spinlock_t *ptl; + int ret; + + _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + dst_vma->vm_page_prot)); + ret = -EEXIST; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!pte_none(*dst_pte)) + goto out_unlock; + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + ret = 0; +out_unlock: + pte_unmap_unlock(dst_pte, ptl); + return ret; +} + +static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + + pgd = pgd_offset(mm, address); + pud = pud_alloc(mm, pgd, address); + if (pud) + /* + * Note that we didn't run this because the pmd was + * missing, the *pmd may be already established and in + * turn it may also be a trans_huge_pmd. + */ + pmd = pmd_alloc(mm, pud, address); + return pmd; +} + +static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + bool zeropage) +{ + struct vm_area_struct *dst_vma; + ssize_t err; + pmd_t *dst_pmd; + unsigned long src_addr, dst_addr; + long copied = 0; + + /* + * Sanitize the command parameters: + */ + BUG_ON(dst_start & ~PAGE_MASK); + BUG_ON(len & ~PAGE_MASK); + + /* Does the address range wrap, or is the span zero-sized? */ + BUG_ON(src_start + len <= src_start); + BUG_ON(dst_start + len <= dst_start); + + down_read(&dst_mm->mmap_sem); + + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + err = -EINVAL; + dst_vma = find_vma(dst_mm, dst_start); + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) + goto out; + if (dst_start < dst_vma->vm_start || + dst_start + len > dst_vma->vm_end) + goto out; + + /* + * Be strict and only allow __mcopy_atomic on userfaultfd + * registered ranges to prevent userland errors going + * unnoticed. As far as the VM consistency is concerned, it + * would be perfectly safe to remove this check, but there's + * no useful usage for __mcopy_atomic ouside of userfaultfd + * registered ranges. This is after all why these are ioctls + * belonging to the userfaultfd and not syscalls. 
+ */ + if (!dst_vma->vm_userfaultfd_ctx.ctx) + goto out; + + /* + * FIXME: only allow copying on anonymous vmas, tmpfs should + * be added. + */ + if (dst_vma->vm_ops) + goto out; + + /* + * Ensure the dst_vma has a anon_vma or this page + * would get a NULL anon_vma when moved in the + * dst_vma. + */ + err = -ENOMEM; + if (unlikely(anon_vma_prepare(dst_vma))) + goto out; + + for (src_addr = src_start, dst_addr = dst_start; + src_addr < src_start + len; ) { + pmd_t dst_pmdval; + BUG_ON(dst_addr >= dst_start + len); + dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); + if (unlikely(!dst_pmd)) { + err = -ENOMEM; + break; + } + + dst_pmdval = pmd_read_atomic(dst_pmd); + /* + * If the dst_pmd is mapped as THP don't + * override it and just be strict. + */ + if (unlikely(pmd_trans_huge(dst_pmdval))) { + err = -EEXIST; + break; + } + if (unlikely(pmd_none(dst_pmdval)) && + unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd, + dst_addr))) { + err = -ENOMEM; + break; + } + /* If an huge pmd materialized from under us fail */ + if (unlikely(pmd_trans_huge(*dst_pmd))) { + err = -EFAULT; + break; + } + + BUG_ON(pmd_none(*dst_pmd)); + BUG_ON(pmd_trans_huge(*dst_pmd)); + + if (!zeropage) + err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr); + else + err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, + dst_addr); + + cond_resched(); + + if (!err) { + dst_addr += PAGE_SIZE; + src_addr += PAGE_SIZE; + copied += PAGE_SIZE; + + if (fatal_signal_pending(current)) + err = -EINTR; + } + if (err) + break; + } + +out: + up_read(&dst_mm->mmap_sem); + BUG_ON(copied < 0); + BUG_ON(err > 0); + BUG_ON(!copied && !err); + return copied ? copied : err; +} + +ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long src_start, unsigned long len) +{ + return __mcopy_atomic(dst_mm, dst_start, src_start, len, false); +} + +ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, + unsigned long len) +{ + return __mcopy_atomic(dst_mm, start, 0, len, true); +} -- cgit v1.2.3 From b6ebaedb4cb1a18220ae626c3a9e184ee39dd248 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:47:08 -0700 Subject: userfaultfd: avoid mmap_sem read recursion in mcopy_atomic If the rwsem starves writers it wasn't strictly a bug but lockdep doesn't like it and this avoids depending on lowlevel implementation details of the lock. [akpm@linux-foundation.org: delete weird BUILD_BUG_ON()] Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. 
David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/userfaultfd.c | 91 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 26 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index c54c761609fc..77fee9325a57 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -21,26 +21,39 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, unsigned long dst_addr, - unsigned long src_addr) + unsigned long src_addr, + struct page **pagep) { struct mem_cgroup *memcg; pte_t _dst_pte, *dst_pte; spinlock_t *ptl; - struct page *page; void *page_kaddr; int ret; + struct page *page; - ret = -ENOMEM; - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr); - if (!page) - goto out; - - page_kaddr = kmap(page); - ret = -EFAULT; - if (copy_from_user(page_kaddr, (const void __user *) src_addr, - PAGE_SIZE)) - goto out_kunmap_release; - kunmap(page); + if (!*pagep) { + ret = -ENOMEM; + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr); + if (!page) + goto out; + + page_kaddr = kmap_atomic(page); + ret = copy_from_user(page_kaddr, + (const void __user *) src_addr, + PAGE_SIZE); + kunmap_atomic(page_kaddr); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + ret = -EFAULT; + *pagep = page; + /* don't free the page */ + goto out; + } + } else { + page = *pagep; + *pagep = NULL; + } /* * The memory barrier inside __SetPageUptodate makes sure that @@ -82,9 +95,6 @@ out_release_uncharge_unlock: out_release: page_cache_release(page); goto out; -out_kunmap_release: - kunmap(page); - goto out_release; } static int mfill_zeropage_pte(struct mm_struct *dst_mm, @@ -139,7 +149,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, ssize_t err; pmd_t *dst_pmd; unsigned long src_addr, dst_addr; - long copied = 0; + long copied; + struct page *page; /* * Sanitize the command parameters: @@ -151,6 +162,11 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, BUG_ON(src_start + len <= src_start); BUG_ON(dst_start + len <= dst_start); + src_addr = src_start; + dst_addr = dst_start; + copied = 0; + page = NULL; +retry: down_read(&dst_mm->mmap_sem); /* @@ -160,10 +176,10 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, err = -EINVAL; dst_vma = find_vma(dst_mm, dst_start); if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) - goto out; + goto out_unlock; if (dst_start < dst_vma->vm_start || dst_start + len > dst_vma->vm_end) - goto out; + goto out_unlock; /* * Be strict and only allow __mcopy_atomic on userfaultfd @@ -175,14 +191,14 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, * belonging to the userfaultfd and not syscalls. */ if (!dst_vma->vm_userfaultfd_ctx.ctx) - goto out; + goto out_unlock; /* * FIXME: only allow copying on anonymous vmas, tmpfs should * be added. 
*/ if (dst_vma->vm_ops) - goto out; + goto out_unlock; /* * Ensure the dst_vma has a anon_vma or this page @@ -191,12 +207,13 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, */ err = -ENOMEM; if (unlikely(anon_vma_prepare(dst_vma))) - goto out; + goto out_unlock; - for (src_addr = src_start, dst_addr = dst_start; - src_addr < src_start + len; ) { + while (src_addr < src_start + len) { pmd_t dst_pmdval; + BUG_ON(dst_addr >= dst_start + len); + dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); if (unlikely(!dst_pmd)) { err = -ENOMEM; @@ -229,13 +246,32 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, if (!zeropage) err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr); + dst_addr, src_addr, &page); else err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, dst_addr); cond_resched(); + if (unlikely(err == -EFAULT)) { + void *page_kaddr; + + up_read(&dst_mm->mmap_sem); + BUG_ON(!page); + + page_kaddr = kmap(page); + err = copy_from_user(page_kaddr, + (const void __user *) src_addr, + PAGE_SIZE); + kunmap(page); + if (unlikely(err)) { + err = -EFAULT; + goto out; + } + goto retry; + } else + BUG_ON(page); + if (!err) { dst_addr += PAGE_SIZE; src_addr += PAGE_SIZE; @@ -248,8 +284,11 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, break; } -out: +out_unlock: up_read(&dst_mm->mmap_sem); +out: + if (page) + page_cache_release(page); BUG_ON(copied < 0); BUG_ON(err > 0); BUG_ON(!copied && !err); -- cgit v1.2.3 From ad465cae96b456b48d26c96f27a0577ba443472a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:47:11 -0700 Subject: userfaultfd: UFFDIO_COPY and UFFDIO_ZEROPAGE These two ioctl allows to either atomically copy or to map zeropages into the virtual address space. This is used by the thread that opened the userfaultfd to resolve the userfaults. Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index febbd2b165df..5f11678907d5 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -983,6 +983,96 @@ out: return ret; } +static int userfaultfd_copy(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_copy uffdio_copy; + struct uffdio_copy __user *user_uffdio_copy; + struct userfaultfd_wake_range range; + + user_uffdio_copy = (struct uffdio_copy __user *) arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_copy, user_uffdio_copy, + /* don't copy "copy" last field */ + sizeof(uffdio_copy)-sizeof(__s64))) + goto out; + + ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); + if (ret) + goto out; + /* + * double check for wraparound just in case. copy_from_user() + * will later check uffdio_copy.src + uffdio_copy.len to fit + * in the userland range. 
+ */ + ret = -EINVAL; + if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) + goto out; + if (uffdio_copy.mode & ~UFFDIO_COPY_MODE_DONTWAKE) + goto out; + + ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, + uffdio_copy.len); + if (unlikely(put_user(ret, &user_uffdio_copy->copy))) + return -EFAULT; + if (ret < 0) + goto out; + BUG_ON(!ret); + /* len == 0 would wake all */ + range.len = ret; + if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) { + range.start = uffdio_copy.dst; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_copy.len ? 0 : -EAGAIN; +out: + return ret; +} + +static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, + unsigned long arg) +{ + __s64 ret; + struct uffdio_zeropage uffdio_zeropage; + struct uffdio_zeropage __user *user_uffdio_zeropage; + struct userfaultfd_wake_range range; + + user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; + + ret = -EFAULT; + if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, + /* don't copy "zeropage" last field */ + sizeof(uffdio_zeropage)-sizeof(__s64))) + goto out; + + ret = validate_range(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + if (ret) + goto out; + ret = -EINVAL; + if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE) + goto out; + + ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) + return -EFAULT; + if (ret < 0) + goto out; + /* len == 0 would wake all */ + BUG_ON(!ret); + range.len = ret; + if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) { + range.start = uffdio_zeropage.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN; +out: + return ret; +} + /* * userland asks for a certain API version and we return which bits * and ioctl commands are implemented in this kernel for such API @@ -1038,6 +1128,12 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, case UFFDIO_WAKE: ret = userfaultfd_wake(ctx, arg); break; + case UFFDIO_COPY: + ret = userfaultfd_copy(ctx, arg); + break; + case UFFDIO_ZEROPAGE: + ret = userfaultfd_zeropage(ctx, arg); + break; } return ret; } -- cgit v1.2.3 From e6485a47b758cae04a496764a1095961ee3249e4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:47:15 -0700 Subject: userfaultfd: require UFFDIO_API before other ioctls UFFDIO_API was already forced before read/poll could work. This makes the code more strict to force it also for all other ioctls. All users would already have been required to call UFFDIO_API before invoking other ioctls but this makes it more explicit. This will ensure we can change all ioctls (all but UFFDIO_API/struct uffdio_api) with a bump of uffdio_api.api. There's no actual plan or need to change the API or the ioctl, the current API already should cover fine even the non cooperative usage, but this is just for the longer term future just in case. 
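In userspace terms, the ordering this patch enforces looks like the sketch below — a hypothetical helper, not code from the patch (error handling elided; the selftest added later in this series performs the same sequence):

#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/* Hypothetical helper: perform the UFFDIO_API handshake that must now
 * precede every other userfaultfd ioctl. */
static int uffd_api_handshake(int uffd)
{
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = 0,
	};

	/* Until this succeeds the context stays in UFFD_STATE_WAIT_API and
	 * UFFDIO_REGISTER/UNREGISTER/WAKE/COPY/ZEROPAGE all fail with
	 * -EINVAL. */
	if (ioctl(uffd, UFFDIO_API, &api))
		return -1;
	return api.api == UFFD_API ? 0 : -1;
}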
Signed-off-by: Andrea Arcangeli Cc: Pavel Emelyanov Cc: Dave Hansen Cc: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5f11678907d5..af88ef6fffff 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -577,7 +577,6 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, if (ctx->state == UFFD_STATE_WAIT_API) return -EINVAL; - BUG_ON(ctx->state != UFFD_STATE_RUNNING); for (;;) { if (count < sizeof(msg)) @@ -1115,6 +1114,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, int ret = -EINVAL; struct userfaultfd_ctx *ctx = file->private_data; + if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API) + return -EINVAL; + switch(cmd) { case UFFDIO_API: ret = userfaultfd_api(ctx, arg); -- cgit v1.2.3 From dfa37dc3fc1f6f81a6900d0e561c02362f4817f6 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:47:18 -0700 Subject: userfaultfd: allow signals to interrupt a userfault This is only simple to achieve if the userfault is going to return to userland (not to the kernel) because we can avoid returning VM_FAULT_RETRY despite we temporarily released the mmap_sem. The fault would just be retried by userland then. This is safe at least on x86 and powerpc (the two archs with the syscall implemented so far). Hint to verify for which archs this is safe: after handle_mm_fault returns, no access to data structures protected by the mmap_sem must be done by the fault code in arch/*/mm/fault.c until up_read(&mm->mmap_sem) is called. This has two main benefits: signals can run with lower latency in production (signals aren't blocked by userfaults and userfaults are immediately repeated after signal processing) and gdb can then trivially debug the threads blocked in this kind of userfaults coming directly from userland. On a side note: while gdb has a need to get signal processed, coredumps always worked perfectly with userfaults, no matter if the userfault is triggered by GUP a kernel copy_user or directly from userland. Signed-off-by: Andrea Arcangeli Cc: Pavel Emelyanov Cc: Dave Hansen Cc: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index af88ef6fffff..a14d63e945f4 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -262,7 +262,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; int ret; - bool must_wait; + bool must_wait, return_to_userland; BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); @@ -327,6 +327,9 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, uwq.msg = userfault_msg(address, flags, reason); uwq.ctx = ctx; + return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == + (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); + spin_lock(&ctx->fault_pending_wqh.lock); /* * After the __add_wait_queue the uwq is visible to userland @@ -338,14 +341,16 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, * following the spin_unlock to happen before the list_add in * __add_wait_queue. */ - set_current_state(TASK_KILLABLE); + set_current_state(return_to_userland ? 
TASK_INTERRUPTIBLE : + TASK_KILLABLE); spin_unlock(&ctx->fault_pending_wqh.lock); must_wait = userfaultfd_must_wait(ctx, address, flags, reason); up_read(&mm->mmap_sem); if (likely(must_wait && !ACCESS_ONCE(ctx->released) && - !fatal_signal_pending(current))) { + (return_to_userland ? !signal_pending(current) : + !fatal_signal_pending(current)))) { wake_up_poll(&ctx->fd_wqh, POLLIN); schedule(); ret |= VM_FAULT_MAJOR; @@ -353,6 +358,30 @@ __set_current_state(TASK_RUNNING); + if (return_to_userland) { + if (signal_pending(current) && + !fatal_signal_pending(current)) { + /* + * If we got a SIGSTOP or SIGCONT and this is + * a normal userland page fault, just let + * userland return so the signal will be + * handled and gdb debugging works. The page + * fault code immediately after we return from + * this function is going to release the + * mmap_sem and it's not depending on it + * (unlike gup would if we were not to return + * VM_FAULT_RETRY). + * + * If a fatal signal is pending we still take + * the streamlined VM_FAULT_RETRY failure path + * and there's no need to retake the mmap_sem + * in such a case. + */ + down_read(&mm->mmap_sem); + ret = 0; + } + } + /* * Here we race with the list_del; list_add in * userfaultfd_ctx_read(), however because we don't ever run -- cgit v1.2.3 From 230c92a8797e0e717c6732de0fffdd5726c0f48f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:47:20 -0700 Subject: userfaultfd: propagate the full address in THP faults The THP faults were not propagating the original fault address. The latest version of the API with uffd.arg.pagefault.address is supposed to propagate the full address through THP faults. This was not a kernel-crashing bug and it wouldn't risk corrupting user memory, but it would cause a SIGBUS failure because the wrong page was being copied. For various reasons this wasn't easily reproducible in the qemu workload, but the stress test exposed the problem immediately.
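The nature of the bug is easiest to see with concrete numbers — a standalone illustration with hypothetical addresses, assuming 2MiB huge pages (not code from the patch):

#include <stdio.h>

#define HPAGE_PMD_SIZE	(2UL << 20)	/* assumes 2MiB THP */
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long address = 0x7f43a1234567UL;	/* hypothetical faulting address */
	unsigned long haddr = address & HPAGE_PMD_MASK;	/* 0x7f43a1200000 */

	/* Before the fix the uffd message carried haddr, so the monitor
	 * resolved the wrong 4k page and the faulting thread hit SIGBUS. */
	printf("faulting address %#lx, huge-page-aligned %#lx\n", address, haddr);
	return 0;
}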
Signed-off-by: Andrea Arcangeli Cc: Pavel Emelyanov Cc: Dave Hansen Cc: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d38aaf9dcba6..279a818a39b1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -717,13 +717,14 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, + unsigned long address, pmd_t *pmd, struct page *page, gfp_t gfp, unsigned int flags) { struct mem_cgroup *memcg; pgtable_t pgtable; spinlock_t *ptl; + unsigned long haddr = address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -765,7 +766,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, mem_cgroup_cancel_charge(page, memcg); put_page(page); pte_free(mm, pgtable); - ret = handle_userfault(vma, haddr, flags, + ret = handle_userfault(vma, address, flags, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; @@ -841,7 +842,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (pmd_none(*pmd)) { if (userfaultfd_missing(vma)) { spin_unlock(ptl); - ret = handle_userfault(vma, haddr, flags, + ret = handle_userfault(vma, address, flags, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); } else { @@ -865,7 +866,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp, flags); + return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, + flags); } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, -- cgit v1.2.3 From 2c5b7e1be74ff0175dedbbd325abe9f0dbbb09ae Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:47:23 -0700 Subject: userfaultfd: avoid missing wakeups during refile in userfaultfd_read During the refile in userfaultfd_read both waitqueues could look empty to the lockless wake_userfault(). Use a seqcount to prevent this false negative that could leave an userfault blocked. Signed-off-by: Andrea Arcangeli Cc: Pavel Emelyanov Cc: Dave Hansen Cc: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index a14d63e945f4..634e676072cb 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -45,6 +45,8 @@ struct userfaultfd_ctx { wait_queue_head_t fault_wqh; /* waitqueue head for the pseudo fd to wakeup poll/read */ wait_queue_head_t fd_wqh; + /* a refile sequence protected by fault_pending_wqh lock */ + struct seqcount refile_seq; /* pseudo fd refcounting */ atomic_t refcount; /* userfaultfd syscall flags */ @@ -546,6 +548,15 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, spin_lock(&ctx->fault_pending_wqh.lock); uwq = find_userfault(ctx); if (uwq) { + /* + * Use a seqcount to repeat the lockless check + * in wake_userfault() to avoid missing + * wakeups because during the refile both + * waitqueue could become empty if this is the + * only userfault. + */ + write_seqcount_begin(&ctx->refile_seq); + /* * The fault_pending_wqh.lock prevents the uwq * to disappear from under us. 
@@ -570,6 +581,8 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, list_del(&uwq->wq.task_list); __add_wait_queue(&ctx->fault_wqh, &uwq->wq); + write_seqcount_end(&ctx->refile_seq); + /* careful to always initialize msg if ret == 0 */ *msg = uwq->msg; spin_unlock(&ctx->fault_pending_wqh.lock); @@ -647,6 +660,9 @@ static void __wake_userfault(struct userfaultfd_ctx *ctx, static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range *range) { + unsigned seq; + bool need_wakeup; + /* * To be sure waitqueue_active() is not reordered by the CPU * before the pagetable update, use an explicit SMP memory @@ -662,8 +678,13 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, * userfaults yet. So we take the spinlock only when we're * sure we've userfaults to wake. */ - if (waitqueue_active(&ctx->fault_pending_wqh) || - waitqueue_active(&ctx->fault_wqh)) + do { + seq = read_seqcount_begin(&ctx->refile_seq); + need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) || + waitqueue_active(&ctx->fault_wqh); + cond_resched(); + } while (read_seqcount_retry(&ctx->refile_seq, seq)); + if (need_wakeup) __wake_userfault(ctx, range); } @@ -1219,6 +1240,7 @@ static void init_once_userfaultfd_ctx(void *mem) init_waitqueue_head(&ctx->fault_pending_wqh); init_waitqueue_head(&ctx->fault_wqh); init_waitqueue_head(&ctx->fd_wqh); + seqcount_init(&ctx->refile_seq); } /** -- cgit v1.2.3 From c47174fc362a089b1125174258e53ef4a69ce6b8 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:47:23 -0700 Subject: userfaultfd: selftest This test allocates two virtual areas and bounces the physical memory across the two virtual areas using only userfaultfd. Signed-off-by: Andrea Arcangeli Cc: Pavel Emelyanov Cc: Dave Hansen Cc: Linus Torvalds Cc: Shuah Khan Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/Makefile | 3 + tools/testing/selftests/vm/run_vmtests | 11 + tools/testing/selftests/vm/userfaultfd.c | 636 +++++++++++++++++++++++++++++++ 3 files changed, 650 insertions(+) create mode 100644 tools/testing/selftests/vm/userfaultfd.c diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 231b9a031f6a..0d6854744b37 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -8,10 +8,13 @@ BINARIES += hugetlbfstest BINARIES += map_hugetlb BINARIES += thuge-gen BINARIES += transhuge-stress +BINARIES += userfaultfd all: $(BINARIES) %: %.c $(CC) $(CFLAGS) -o $@ $^ -lrt +userfaultfd: userfaultfd.c + $(CC) $(CFLAGS) -O2 -o $@ $^ -lpthread TEST_PROGS := run_vmtests TEST_FILES := $(BINARIES) diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index 49ece11ff7fd..831adeb5fc55 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -86,6 +86,17 @@ else echo "[PASS]" fi +echo "--------------------" +echo "running userfaultfd" +echo "--------------------" +./userfaultfd 128 32 +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + #cleanup umount $mnt rm -rf $mnt diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c new file mode 100644 index 000000000000..0c0b83953352 --- /dev/null +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -0,0 +1,636 @@ +/* + * Stress userfaultfd syscall. + * + * Copyright (C) 2015 Red Hat, Inc. 
+ * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * This test allocates two virtual areas and bounces the physical + * memory across the two virtual areas (from area_src to area_dst) + * using userfaultfd. + * + * There are three threads running per CPU: + * + * 1) one per-CPU thread takes a per-page pthread_mutex in a random + * page of the area_dst (while the physical page may still be in + * area_src), and increments a per-page counter in the same page, + * and checks its value against a verification region. + * + * 2) another per-CPU thread handles the userfaults generated by + * thread 1 above. userfaultfd blocking reads or poll() modes are + * exercised interleaved. + * + * 3) one last per-CPU thread transfers the memory in the background + * at maximum bandwidth (if not already transferred by thread + * 2). Each CPU thread takes care of transferring a portion of the + * area. + * + * When all threads of type 3 have completed the transfer, one bounce is + * complete. area_src and area_dst are then swapped. All threads are + * respawned and so the bounce is immediately restarted in the + * opposite direction. + * + * The per-CPU threads of type 1, by triggering userfaults inside + * pthread_mutex_lock, will also verify the atomicity of the memory + * transfer (UFFDIO_COPY). + * + * The program takes two parameters: the amount of physical memory in + * megabytes (MiB) of the area and the number of bounces to execute. + * + * # 100MiB 99999 bounces + * ./userfaultfd 100 99999 + * + * # 1GiB 99 bounces + * ./userfaultfd 1000 99 + * + * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers + * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../../../include/uapi/linux/userfaultfd.h" + +#ifdef __x86_64__ +#define __NR_userfaultfd 323 +#elif defined(__i386__) +#define __NR_userfaultfd 359 +#elif defined(__powerpc__) +#define __NR_userfaultfd 364 +#else +#error "missing __NR_userfaultfd definition" +#endif + +static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; + +#define BOUNCE_RANDOM (1<<0) +#define BOUNCE_RACINGFAULTS (1<<1) +#define BOUNCE_VERIFY (1<<2) +#define BOUNCE_POLL (1<<3) +static int bounces; + +static unsigned long long *count_verify; +static int uffd, finished, *pipefd; +static char *area_src, *area_dst; +static char *zeropage; +pthread_attr_t attr; + +/* pthread_mutex_t starts at page offset 0 */ +#define area_mutex(___area, ___nr) \ + ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) +/* + * count is placed in the page after pthread_mutex_t naturally aligned + * to avoid alignment faults on non-x86 archs.
+ */ +#define area_count(___area, ___nr) \ + ((volatile unsigned long long *) ((unsigned long) \ + ((___area) + (___nr)*page_size + \ + sizeof(pthread_mutex_t) + \ + sizeof(unsigned long long) - 1) & \ + ~(unsigned long)(sizeof(unsigned long long) \ + - 1))) + +static int my_bcmp(char *str1, char *str2, size_t n) +{ + unsigned long i; + for (i = 0; i < n; i++) + if (str1[i] != str2[i]) + return 1; + return 0; +} + +static void *locking_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + struct random_data rand; + unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */ + int32_t rand_nr; + unsigned long long count; + char randstate[64]; + unsigned int seed; + time_t start; + + if (bounces & BOUNCE_RANDOM) { + seed = (unsigned int) time(NULL) - bounces; + if (!(bounces & BOUNCE_RACINGFAULTS)) + seed += cpu; + bzero(&rand, sizeof(rand)); + bzero(&randstate, sizeof(randstate)); + if (initstate_r(seed, randstate, sizeof(randstate), &rand)) + fprintf(stderr, "srandom_r error\n"), exit(1); + } else { + page_nr = -bounces; + if (!(bounces & BOUNCE_RACINGFAULTS)) + page_nr += cpu * nr_pages_per_cpu; + } + + while (!finished) { + if (bounces & BOUNCE_RANDOM) { + if (random_r(&rand, &rand_nr)) + fprintf(stderr, "random_r 1 error\n"), exit(1); + page_nr = rand_nr; + if (sizeof(page_nr) > sizeof(rand_nr)) { + if (random_r(&rand, &rand_nr)) + fprintf(stderr, "random_r 2 error\n"), exit(1); + page_nr |= ((unsigned long) rand_nr) << 32; + } + } else + page_nr += 1; + page_nr %= nr_pages; + + start = time(NULL); + if (bounces & BOUNCE_VERIFY) { + count = *area_count(area_dst, page_nr); + if (!count) + fprintf(stderr, + "page_nr %lu wrong count %Lu %Lu\n", + page_nr, count, + count_verify[page_nr]), exit(1); + + + /* + * We can't use bcmp (or memcmp) because that + * returns 0 erroneously if the memory is + * changing under it (even if the end of the + * page is never changing and always + * different). 
+ */ +#if 1 + if (!my_bcmp(area_dst + page_nr * page_size, zeropage, + page_size)) + fprintf(stderr, + "my_bcmp page_nr %lu wrong count %Lu %Lu\n", + page_nr, count, + count_verify[page_nr]), exit(1); +#else + unsigned long loops; + + loops = 0; + /* uncomment the below line to test with mutex */ + /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */ + while (!bcmp(area_dst + page_nr * page_size, zeropage, + page_size)) { + loops += 1; + if (loops > 10) + break; + } + /* uncomment below line to test with mutex */ + /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */ + if (loops) { + fprintf(stderr, + "page_nr %lu all zero thread %lu %p %lu\n", + page_nr, cpu, area_dst + page_nr * page_size, + loops); + if (loops > 10) + exit(1); + } +#endif + } + + pthread_mutex_lock(area_mutex(area_dst, page_nr)); + count = *area_count(area_dst, page_nr); + if (count != count_verify[page_nr]) { + fprintf(stderr, + "page_nr %lu memory corruption %Lu %Lu\n", + page_nr, count, + count_verify[page_nr]), exit(1); + } + count++; + *area_count(area_dst, page_nr) = count_verify[page_nr] = count; + pthread_mutex_unlock(area_mutex(area_dst, page_nr)); + + if (time(NULL) - start > 1) + fprintf(stderr, + "userfault too slow %ld " + "possible false positive with overcommit\n", + time(NULL) - start); + } + + return NULL; +} + +static int copy_page(unsigned long offset) +{ + struct uffdio_copy uffdio_copy; + + if (offset >= nr_pages * page_size) + fprintf(stderr, "unexpected offset %lu\n", + offset), exit(1); + uffdio_copy.dst = (unsigned long) area_dst + offset; + uffdio_copy.src = (unsigned long) area_src + offset; + uffdio_copy.len = page_size; + uffdio_copy.mode = 0; + uffdio_copy.copy = 0; + if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy)) { + /* real retval in ufdio_copy.copy */ + if (uffdio_copy.copy != -EEXIST) + fprintf(stderr, "UFFDIO_COPY error %Ld\n", + uffdio_copy.copy), exit(1); + } else if (uffdio_copy.copy != page_size) { + fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n", + uffdio_copy.copy), exit(1); + } else + return 1; + return 0; +} + +static void *uffd_poll_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + struct pollfd pollfd[2]; + struct uffd_msg msg; + int ret; + unsigned long offset; + char tmp_chr; + unsigned long userfaults = 0; + + pollfd[0].fd = uffd; + pollfd[0].events = POLLIN; + pollfd[1].fd = pipefd[cpu*2]; + pollfd[1].events = POLLIN; + + for (;;) { + ret = poll(pollfd, 2, -1); + if (!ret) + fprintf(stderr, "poll error %d\n", ret), exit(1); + if (ret < 0) + perror("poll"), exit(1); + if (pollfd[1].revents & POLLIN) { + if (read(pollfd[1].fd, &tmp_chr, 1) != 1) + fprintf(stderr, "read pipefd error\n"), + exit(1); + break; + } + if (!(pollfd[0].revents & POLLIN)) + fprintf(stderr, "pollfd[0].revents %d\n", + pollfd[0].revents), exit(1); + ret = read(uffd, &msg, sizeof(msg)); + if (ret < 0) { + if (errno == EAGAIN) + continue; + perror("nonblocking read error"), exit(1); + } + if (msg.event != UFFD_EVENT_PAGEFAULT) + fprintf(stderr, "unexpected msg event %u\n", + msg.event), exit(1); + if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + fprintf(stderr, "unexpected write fault\n"), exit(1); + offset = (char *)msg.arg.pagefault.address - area_dst; + offset &= ~(page_size-1); + if (copy_page(offset)) + userfaults++; + } + return (void *)userfaults; +} + +pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; + +static void *uffd_read_thread(void *arg) +{ + unsigned long *this_cpu_userfaults; + struct uffd_msg msg; + unsigned long offset; + int ret; + + 
this_cpu_userfaults = (unsigned long *) arg; + *this_cpu_userfaults = 0; + + pthread_mutex_unlock(&uffd_read_mutex); + /* from here cancellation is ok */ + + for (;;) { + ret = read(uffd, &msg, sizeof(msg)); + if (ret != sizeof(msg)) { + if (ret < 0) + perror("blocking read error"), exit(1); + else + fprintf(stderr, "short read\n"), exit(1); + } + if (msg.event != UFFD_EVENT_PAGEFAULT) + fprintf(stderr, "unexpected msg event %u\n", + msg.event), exit(1); + if (bounces & BOUNCE_VERIFY && + msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + fprintf(stderr, "unexpected write fault\n"), exit(1); + offset = (char *)msg.arg.pagefault.address - area_dst; + offset &= ~(page_size-1); + if (copy_page(offset)) + (*this_cpu_userfaults)++; + } + return (void *)NULL; +} + +static void *background_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + unsigned long page_nr; + + for (page_nr = cpu * nr_pages_per_cpu; + page_nr < (cpu+1) * nr_pages_per_cpu; + page_nr++) + copy_page(page_nr * page_size); + + return NULL; +} + +static int stress(unsigned long *userfaults) +{ + unsigned long cpu; + pthread_t locking_threads[nr_cpus]; + pthread_t uffd_threads[nr_cpus]; + pthread_t background_threads[nr_cpus]; + void **_userfaults = (void **) userfaults; + + finished = 0; + for (cpu = 0; cpu < nr_cpus; cpu++) { + if (pthread_create(&locking_threads[cpu], &attr, + locking_thread, (void *)cpu)) + return 1; + if (bounces & BOUNCE_POLL) { + if (pthread_create(&uffd_threads[cpu], &attr, + uffd_poll_thread, (void *)cpu)) + return 1; + } else { + if (pthread_create(&uffd_threads[cpu], &attr, + uffd_read_thread, + &_userfaults[cpu])) + return 1; + pthread_mutex_lock(&uffd_read_mutex); + } + if (pthread_create(&background_threads[cpu], &attr, + background_thread, (void *)cpu)) + return 1; + } + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(background_threads[cpu], NULL)) + return 1; + + /* + * Be strict and immediately zap area_src, the whole area has + * been transferred already by the background threads. The + * area_src could then be faulted in in a racy way by still + * running uffdio_threads reading zeropages after we zapped + * area_src (but they're guaranteed to get -EEXIST from + * UFFDIO_COPY without writing zero pages into area_dst + * because the background threads already completed).
+ */ + if (madvise(area_src, nr_pages * page_size, MADV_DONTNEED)) { + perror("madvise"); + return 1; + } + + for (cpu = 0; cpu < nr_cpus; cpu++) { + char c; + if (bounces & BOUNCE_POLL) { + if (write(pipefd[cpu*2+1], &c, 1) != 1) { + fprintf(stderr, "pipefd write error\n"); + return 1; + } + if (pthread_join(uffd_threads[cpu], &_userfaults[cpu])) + return 1; + } else { + if (pthread_cancel(uffd_threads[cpu])) + return 1; + if (pthread_join(uffd_threads[cpu], NULL)) + return 1; + } + } + + finished = 1; + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(locking_threads[cpu], NULL)) + return 1; + + return 0; +} + +static int userfaultfd_stress(void) +{ + void *area; + char *tmp_area; + unsigned long nr; + struct uffdio_register uffdio_register; + struct uffdio_api uffdio_api; + unsigned long cpu; + int uffd_flags; + unsigned long userfaults[nr_cpus]; + + if (posix_memalign(&area, page_size, nr_pages * page_size)) { + fprintf(stderr, "out of memory\n"); + return 1; + } + area_src = area; + if (posix_memalign(&area, page_size, nr_pages * page_size)) { + fprintf(stderr, "out of memory\n"); + return 1; + } + area_dst = area; + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd < 0) { + fprintf(stderr, + "userfaultfd syscall not available in this kernel\n"); + return 1; + } + uffd_flags = fcntl(uffd, F_GETFD, NULL); + + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { + fprintf(stderr, "UFFDIO_API\n"); + return 1; + } + if (uffdio_api.api != UFFD_API) { + fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api); + return 1; + } + + count_verify = malloc(nr_pages * sizeof(unsigned long long)); + if (!count_verify) { + perror("count_verify"); + return 1; + } + + for (nr = 0; nr < nr_pages; nr++) { + *area_mutex(area_src, nr) = (pthread_mutex_t) + PTHREAD_MUTEX_INITIALIZER; + count_verify[nr] = *area_count(area_src, nr) = 1; + } + + pipefd = malloc(sizeof(int) * nr_cpus * 2); + if (!pipefd) { + perror("pipefd"); + return 1; + } + for (cpu = 0; cpu < nr_cpus; cpu++) { + if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) { + perror("pipe"); + return 1; + } + } + + if (posix_memalign(&area, page_size, page_size)) { + fprintf(stderr, "out of memory\n"); + return 1; + } + zeropage = area; + bzero(zeropage, page_size); + + pthread_mutex_lock(&uffd_read_mutex); + + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, 16*1024*1024); + + while (bounces--) { + unsigned long expected_ioctls; + + printf("bounces: %d, mode:", bounces); + if (bounces & BOUNCE_RANDOM) + printf(" rnd"); + if (bounces & BOUNCE_RACINGFAULTS) + printf(" racing"); + if (bounces & BOUNCE_VERIFY) + printf(" ver"); + if (bounces & BOUNCE_POLL) + printf(" poll"); + printf(", "); + fflush(stdout); + + if (bounces & BOUNCE_POLL) + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + else + fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); + + /* register */ + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) { + fprintf(stderr, "register failure\n"); + return 1; + } + expected_ioctls = (1 << _UFFDIO_WAKE) | + (1 << _UFFDIO_COPY) | + (1 << _UFFDIO_ZEROPAGE); + if ((uffdio_register.ioctls & expected_ioctls) != + expected_ioctls) { + fprintf(stderr, + "unexpected missing ioctl for anon memory\n"); + return 1; + } + + /* + * The madvise done previously isn't enough: some + * uffd_thread could have read 
userfaults (one of + * those already resolved by the background thread) + * and it may be in the process of calling + * UFFDIO_COPY. UFFDIO_COPY will read the zapped + * area_src and it would map a zero page in it (of + * course such a UFFDIO_COPY is perfectly safe as it'd + * return -EEXIST). The problem comes at the next + * bounce though: that racing UFFDIO_COPY would + * generate zeropages in the area_src, so invalidating + * the previous MADV_DONTNEED. Without this additional + * MADV_DONTNEED those zeropages leftovers in the + * area_src would lead to -EEXIST failure during the + * next bounce, effectively leaving a zeropage in the + * area_dst. + * + * Try commenting out this madvise to see the memory + * corruption being caught pretty quickly. + * + * khugepaged is also inhibited from collapsing THP after + * MADV_DONTNEED, but only once the UFFDIO_REGISTER is done, + * so it's required to MADV_DONTNEED here. + */ + if (madvise(area_dst, nr_pages * page_size, MADV_DONTNEED)) { + perror("madvise 2"); + return 1; + } + + /* bounce pass */ + if (stress(userfaults)) + return 1; + + /* unregister */ + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) { + fprintf(stderr, "unregister failure\n"); + return 1; + } + + /* verification */ + if (bounces & BOUNCE_VERIFY) { + for (nr = 0; nr < nr_pages; nr++) { + if (my_bcmp(area_dst, + area_dst + nr * page_size, + sizeof(pthread_mutex_t))) { + fprintf(stderr, + "error mutex 2 %lu\n", + nr); + bounces = 0; + } + if (*area_count(area_dst, nr) != count_verify[nr]) { + fprintf(stderr, + "error area_count %Lu %Lu %lu\n", + *area_count(area_src, nr), + count_verify[nr], + nr); + bounces = 0; + } + } + } + + /* prepare next bounce */ + tmp_area = area_src; + area_src = area_dst; + area_dst = tmp_area; + + printf("userfaults:"); + for (cpu = 0; cpu < nr_cpus; cpu++) + printf(" %lu", userfaults[cpu]); + printf("\n"); + } + + return 0; +} + +int main(int argc, char **argv) +{ + if (argc < 3) + fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + page_size = sysconf(_SC_PAGE_SIZE); + if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) > + page_size) + fprintf(stderr, "Impossible to run this test\n"), exit(2); + nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size / + nr_cpus; + if (!nr_pages_per_cpu) { + fprintf(stderr, "invalid MiB\n"); + fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); + } + bounces = atoi(argv[2]); + if (bounces <= 0) { + fprintf(stderr, "invalid bounces\n"); + fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1); + } + nr_pages = nr_pages_per_cpu * nr_cpus; + printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", + nr_pages, nr_pages_per_cpu); + return userfaultfd_stress(); +} -- cgit v1.2.3 From 5b74283ab251b9db55cbbe31d19ca72482103290 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 4 Sep 2015 15:47:29 -0700 Subject: x86, mm: trace when an IPI is about to be sent When unmapping pages it is necessary to flush the TLB. If that page was accessed by another CPU then an IPI is used to flush the remote CPU. That is a lot of IPIs if kswapd is scanning and unmapping >100K pages per second. There already is a window between when a page is unmapped and when it is TLB flushed. This series increases the window so multiple pages can be flushed using a single IPI. This should be safe or the kernel is hosed already. Patch 1 simply made the rest of the series easier to write as ftrace could identify all the senders of TLB flush IPIs. Patch 2 tracks what CPUs potentially map a PFN and then sends an IPI to flush the entire TLB.
Patch 3 tracks when there potentially are writable TLB entries that need to be batched differently. Patch 4 increases SWAP_CLUSTER_MAX to further batch flushes. The performance impact is documented in the changelogs but in the optimistic case on a 4-socket machine the full series reduces interrupts from 900K interrupts/second to 60K interrupts/second. This patch (of 4): It is easy to trace when an IPI is received to flush a TLB but harder to detect what event sent it. This patch makes it easy to identify the source of IPIs being transmitted for TLB flushes on x86. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Reviewed-by: Dave Hansen Acked-by: Ingo Molnar Cc: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/tlb.c | 1 + include/linux/mm_types.h | 1 + include/trace/events/tlb.h | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 90b924acd982..8ddb5d0d66fb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -140,6 +140,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_end = end; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start); if (is_uv_system()) { unsigned int cpu; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 26a30c3566f0..c8d0a73d64c4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -554,6 +554,7 @@ enum tlb_flush_reason { TLB_REMOTE_SHOOTDOWN, TLB_LOCAL_SHOOTDOWN, TLB_LOCAL_MM_SHOOTDOWN, + TLB_REMOTE_SEND_IPI, NR_TLB_FLUSH_REASONS, }; diff --git a/include/trace/events/tlb.h b/include/trace/events/tlb.h index 4250f364a6ca..bc8815f45f3b 100644 --- a/include/trace/events/tlb.h +++ b/include/trace/events/tlb.h @@ -11,7 +11,8 @@ EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \ EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \ EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \ - EMe( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) + EM( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) \ + EMe( TLB_REMOTE_SEND_IPI, "remote ipi send" ) /* * First define the enums in TLB_FLUSH_REASON to be exported to userspace -- cgit v1.2.3 From 72b252aed506b8f1a03f7abd29caef4cdf6a043b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 4 Sep 2015 15:47:32 -0700 Subject: mm: send one IPI per CPU to TLB flush all entries after unmapping pages An IPI is sent to flush remote TLBs when a page is unmapped that was potentially accessed by other CPUs. There are many circumstances where this happens but the obvious one is kswapd reclaiming pages belonging to a running process as kswapd and the task are likely running on separate CPUs. On small machines, this is not a significant problem but as machines get larger with more cores and more memory, the cost of these IPIs can be high. This patch uses a simple structure that tracks CPUs that potentially have TLB entries for pages being unmapped. When the unmapping is complete, the full TLB is flushed on the assumption that a refill cost is lower than flushing individual entries. Architectures wishing to do this must give the following guarantee. If a clean page is unmapped and not immediately flushed, the architecture must guarantee that a write to that linear address from a CPU with a cached TLB entry will trap a page fault. This is essentially what the kernel already depends on but the window is much larger with this patch applied and is worth highlighting.
The architecture should consider whether the cost of the full TLB flush is higher than sending an IPI to flush each individual entry. An additional architecture helper called flush_tlb_local is required. It's a trivial wrapper with some accounting in the x86 case. The impact of this patch depends on the workload as measuring any benefit requires both mapped pages co-located on the LRU and memory pressure. The case with the biggest impact is multiple processes reading mapped pages taken from the vm-scalability test suite. The test case uses NR_CPU readers of mapped files that consume 10*RAM. Linear mapped reader on a 4-node machine with 64G RAM and 48 CPUs 4.2.0-rc1 4.2.0-rc1 vanilla flushfull-v7 Ops lru-file-mmap-read-elapsed 159.62 ( 0.00%) 120.68 ( 24.40%) Ops lru-file-mmap-read-time_range 30.59 ( 0.00%) 2.80 ( 90.85%) Ops lru-file-mmap-read-time_stddv 6.70 ( 0.00%) 0.64 ( 90.38%) 4.2.0-rc1 4.2.0-rc1 vanilla flushfull-v7 User 581.00 611.43 System 5804.93 4111.76 Elapsed 161.03 122.12 This is showing that the readers completed 24.40% faster with 29% less system CPU time. From vmstats, it is known that the vanilla kernel was interrupted roughly 900K times per second during the steady phase of the test and the patched kernel was interrupted roughly 180K times per second. The impact is lower on a single-socket machine. 4.2.0-rc1 4.2.0-rc1 vanilla flushfull-v7 Ops lru-file-mmap-read-elapsed 25.33 ( 0.00%) 20.38 ( 19.54%) Ops lru-file-mmap-read-time_range 0.91 ( 0.00%) 1.44 (-58.24%) Ops lru-file-mmap-read-time_stddv 0.28 ( 0.00%) 0.47 (-65.34%) 4.2.0-rc1 4.2.0-rc1 vanilla flushfull-v7 User 58.09 57.64 System 111.82 76.56 Elapsed 27.29 22.55 It's still a noticeable improvement with vmstat showing interrupts went from roughly 500K per second to 45K per second. The patch will have no impact on workloads with no memory pressure or with relatively few mapped pages. It will have an unpredictable impact on the workload running on the CPU being flushed as it'll depend on how many TLB entries need to be refilled and how long that takes. Worst case, the TLB will be completely cleared of active entries when the target PFNs were not resident at all.
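Condensed to its core, the mechanism in the hunks below works roughly as follows — a sketch using identifiers from the diff, not a drop-in (the surrounding reclaim loop and locking are elided):

/* Unmap side: clear the PTE without flushing and remember which CPUs
 * might still cache an entry for this mm. */
pteval = ptep_get_and_clear(mm, address, pte);
cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
tlb_ubc->flush_required = true;

/* After the whole batch: one IPI per CPU in the accumulated mask; each
 * receiver does a full local flush and refills on demand. */
smp_call_function_many(&tlb_ubc->cpumask, percpu_flush_tlb_batch_pages,
		       (void *)tlb_ubc, true);
cpumask_clear(&tlb_ubc->cpumask);
tlb_ubc->flush_required = false;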
[sasha.levin@oracle.com: trace tlb flush after disabling preemption in try_to_unmap_flush] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Dave Hansen Acked-by: Ingo Molnar Cc: Linus Torvalds Signed-off-by: Sasha Levin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/tlbflush.h | 6 +++ include/linux/rmap.h | 3 ++ include/linux/sched.h | 16 +++++++ init/Kconfig | 10 ++++ mm/internal.h | 11 +++++ mm/rmap.c | 104 +++++++++++++++++++++++++++++++++++++++- mm/vmscan.c | 23 ++++++++- 8 files changed, 172 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 48f7433dac6f..117e2f373e50 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -41,6 +41,7 @@ config X86 select ARCH_USE_CMPXCHG_LOCKREF if X86_64 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_IPC_PARSE_VERSION if X86_32 diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cd791948b286..6df2029405a3 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void) #endif /* SMP */ +/* Not inlined due to inc_irq_stat not being defined yet */ +#define flush_tlb_local() { \ + inc_irq_stat(irq_tlb_count); \ + local_flush_tlb(); \ +} + #ifndef CONFIG_PARAVIRT #define flush_tlb_others(mask, mm, start, end) \ native_flush_tlb_others(mask, mm, start, end) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c89c53a113a8..29446aeef36e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -89,6 +89,9 @@ enum ttu_flags { TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ + TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible + * and caller guarantees they will + * do a final flush if necessary */ }; #ifdef CONFIG_MMU diff --git a/include/linux/sched.h b/include/linux/sched.h index 119823decc46..3c602c20c717 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1344,6 +1344,18 @@ enum perf_event_task_context { perf_nr_task_contexts, }; +/* Track pages that require TLB flushes */ +struct tlbflush_unmap_batch { + /* + * Each bit set is a CPU that potentially has a TLB entry for one of + * the PFNs being flushed. See set_tlb_ubc_flush_pending(). + */ + struct cpumask cpumask; + + /* True if any bit in cpumask is set */ + bool flush_required; +}; + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; @@ -1700,6 +1712,10 @@ struct task_struct { unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + struct tlbflush_unmap_batch tlb_ubc; +#endif + struct rcu_head rcu; /* diff --git a/init/Kconfig b/init/Kconfig index 161acd8bc56f..cf7e4824c8d0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -882,6 +882,16 @@ config GENERIC_SCHED_CLOCK config ARCH_SUPPORTS_NUMA_BALANCING bool +# +# For architectures that prefer to flush all TLBs after a number of pages +# are unmapped instead of sending one IPI per page to flush. The architecture +# must provide guarantees on what happens if a clean TLB cache entry is +# written after the unmap. Details are in mm/rmap.c near the check for +# should_defer_flush. 
The architecture should also consider if the full flush +# and the refill costs are offset by the savings of sending fewer IPIs. +config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + bool + # # For architectures that know their GCC __int128 support is sound # diff --git a/mm/internal.h b/mm/internal.h index 36b23f1e2ca6..bd6372ac5f7f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -426,4 +426,15 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #define ALLOC_FAIR 0x100 /* fair zone allocation */ +enum ttu_flags; +struct tlbflush_unmap_batch; + +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +void try_to_unmap_flush(void); +#else +static inline void try_to_unmap_flush(void) +{ +} + +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ #endif /* __MM_INTERNAL_H */ diff --git a/mm/rmap.c b/mm/rmap.c index 171b68768df1..326d5d89e45c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -62,6 +62,8 @@ #include +#include + #include "internal.h" static struct kmem_cache *anon_vma_cachep; @@ -583,6 +585,89 @@ vma_address(struct page *page, struct vm_area_struct *vma) return address; } +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +static void percpu_flush_tlb_batch_pages(void *data) +{ + /* + * All TLB entries are flushed on the assumption that it is + * cheaper to flush all TLBs and let them be refilled than + * flushing individual PFNs. Note that we do not track mm's + * to flush as that might simply be multiple full TLB flushes + * for no gain. + */ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + flush_tlb_local(); +} + +/* + * Flush TLB entries for recently unmapped pages from remote CPUs. It is + * important if a PTE was dirty when it was unmapped that it's flushed + * before any IO is initiated on the page to prevent lost writes. Similarly, + * it must be flushed before freeing to prevent data leakage. + */ +void try_to_unmap_flush(void) +{ + struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; + int cpu; + + if (!tlb_ubc->flush_required) + return; + + cpu = get_cpu(); + + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL); + + if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) + percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask); + + if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) { + smp_call_function_many(&tlb_ubc->cpumask, + percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true); + } + cpumask_clear(&tlb_ubc->cpumask); + tlb_ubc->flush_required = false; + put_cpu(); +} + +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, + struct page *page) +{ + struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; + + cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); + tlb_ubc->flush_required = true; +} + +/* + * Returns true if the TLB flush should be deferred to the end of a batch of + * unmap operations to reduce IPIs. + */ +static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) +{ + bool should_defer = false; + + if (!(flags & TTU_BATCH_FLUSH)) + return false; + + /* If remote CPUs need to be flushed then defer batch the flush */ + if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) + should_defer = true; + put_cpu(); + + return should_defer; +} +#else +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, + struct page *page) +{ +} + +static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) +{ + return false; +} +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + /* * At what user virtual address is page expected in vma? 
* Caller should check the page is actually part of the vma. @@ -1220,7 +1305,24 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush(vma, address, pte); + if (should_defer_flush(mm, flags)) { + /* + * We clear the PTE but do not flush so potentially a remote + * CPU could still be writing to the page. If the entry was + * previously clean then the architecture must guarantee that + * a clear->dirty transition on a cached TLB entry is written + * through and traps if the PTE is unmapped. + */ + pteval = ptep_get_and_clear(mm, address, pte); + + /* Potentially writable TLBs must be flushed before IO */ + if (pte_dirty(pteval)) + flush_tlb_page(vma, address); + else + set_tlb_ubc_flush_pending(mm, page); + } else { + pteval = ptep_clear_flush(vma, address, pte); + } /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8286938c70de..99ec00d6a5dd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1057,7 +1057,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, ttu_flags)) { + switch (try_to_unmap(page, + ttu_flags|TTU_BATCH_FLUSH)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -1208,6 +1209,7 @@ keep: } mem_cgroup_uncharge_list(&free_pages); + try_to_unmap_flush(); free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); @@ -2151,6 +2153,23 @@ out: } } +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +static void init_tlb_ubc(void) +{ + /* + * This deliberately does not clear the cpumask as it's expensive + * and unnecessary. If there happens to be data in there then the + * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and + * then will be cleared. + */ + current->tlb_ubc.flush_required = false; +} +#else +static inline void init_tlb_ubc(void) +{ +} +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -2185,6 +2204,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); + init_tlb_ubc(); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { -- cgit v1.2.3 From d950c9477d51f0cefc2ed3cf76e695d46af0d9c1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 4 Sep 2015 15:47:35 -0700 Subject: mm: defer flush of writable TLB entries If a PTE is unmapped and it's dirty then it was writable recently. Due to deferred TLB flushing, it's best to assume a writable TLB cache entry exists. With that assumption, the TLB must be flushed before any IO can start or the page is freed to avoid lost writes or data corruption. This patch defers flushing of potentially writable TLBs as long as possible. 
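As a rough illustration of the deferral logic in these two patches — a toy, self-contained user-space C model, not kernel code: the cpumask is a plain 64-bit mask, the IPI flush is a printf, and the function names only mirror their kernel counterparts (set_tlb_ubc_flush_pending(), try_to_unmap_flush_dirty()).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Models struct tlbflush_unmap_batch from the patches above */
struct batch {
	uint64_t cpumask;	/* CPUs that may cache a stale entry */
	bool flush_required;	/* some PTE was cleared without a flush */
	bool writable;		/* some cleared PTE was dirty */
};

/* Mirrors set_tlb_ubc_flush_pending(): record, don't flush yet */
static void set_flush_pending(struct batch *b, uint64_t mm_mask, bool dirty)
{
	b->cpumask |= mm_mask;
	b->flush_required = true;
	if (dirty)		/* dirty PTE => assume a writable TLB entry */
		b->writable = true;
}

/* Mirrors try_to_unmap_flush(): one "IPI" flushes the whole batch */
static void flush(struct batch *b)
{
	if (!b->flush_required)
		return;
	printf("flush IPIs to cpumask %#llx\n",
	       (unsigned long long)b->cpumask);
	b->cpumask = 0;
	b->flush_required = false;
	b->writable = false;
}

/* Mirrors try_to_unmap_flush_dirty(): flush only before page IO */
static void flush_dirty(struct batch *b)
{
	if (b->writable)
		flush(b);
}

int main(void)
{
	struct batch b = { 0, false, false };

	set_flush_pending(&b, 0x3, false);	/* clean PTE: defer */
	flush_dirty(&b);			/* nothing writable: no IPI */
	set_flush_pending(&b, 0xc, true);	/* dirty PTE */
	flush_dirty(&b);			/* must flush before pageout */
	flush(&b);				/* final flush: no-op here */
	return 0;
}

In the kernel the same ordering holds: reclaim batches clean unmaps freely, but shrink_page_list() calls try_to_unmap_flush_dirty() before pageout() and try_to_unmap_flush() before the freed pages are handed back.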
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Dave Hansen Acked-by: Ingo Molnar Cc: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 7 +++++++ mm/internal.h | 4 ++++ mm/rmap.c | 28 +++++++++++++++++++++------- mm/vmscan.c | 7 ++++++- 4 files changed, 38 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 3c602c20c717..a4ab9daa387c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1354,6 +1354,13 @@ struct tlbflush_unmap_batch { /* True if any bit in cpumask is set */ bool flush_required; + + /* + * If true then the PTE was dirty when unmapped. The entry must be + * flushed before IO is initiated or a stale TLB entry potentially + * allows an update without redirtying the page. + */ + bool writable; }; struct task_struct { diff --git a/mm/internal.h b/mm/internal.h index bd6372ac5f7f..1195dd2d6a2b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -431,10 +431,14 @@ struct tlbflush_unmap_batch; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH void try_to_unmap_flush(void); +void try_to_unmap_flush_dirty(void); #else static inline void try_to_unmap_flush(void) { } +static inline void try_to_unmap_flush_dirty(void) +{ +} #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ #endif /* __MM_INTERNAL_H */ diff --git a/mm/rmap.c b/mm/rmap.c index 326d5d89e45c..0db38e7d0a72 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -626,16 +626,34 @@ void try_to_unmap_flush(void) } cpumask_clear(&tlb_ubc->cpumask); tlb_ubc->flush_required = false; + tlb_ubc->writable = false; put_cpu(); } +/* Flush iff there are potentially writable TLB entries that can race with IO */ +void try_to_unmap_flush_dirty(void) +{ + struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; + + if (tlb_ubc->writable) + try_to_unmap_flush(); +} + static void set_tlb_ubc_flush_pending(struct mm_struct *mm, - struct page *page) + struct page *page, bool writable) { struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); tlb_ubc->flush_required = true; + + /* + * If the PTE was dirty then it's best to assume it's writable. The + * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() + * before the page is queued for IO. + */ + if (writable) + tlb_ubc->writable = true; } /* @@ -658,7 +676,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) } #else static void set_tlb_ubc_flush_pending(struct mm_struct *mm, - struct page *page) + struct page *page, bool writable) { } @@ -1315,11 +1333,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ pteval = ptep_get_and_clear(mm, address, pte); - /* Potentially writable TLBs must be flushed before IO */ - if (pte_dirty(pteval)) - flush_tlb_page(vma, address); - else - set_tlb_ubc_flush_pending(mm, page); + set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval)); } else { pteval = ptep_clear_flush(vma, address, pte); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 99ec00d6a5dd..b1139039122a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1098,7 +1098,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!sc->may_writepage) goto keep_locked; - /* Page is dirty, try to write it out here */ + /* + * Page is dirty. Flush the TLB if a writable entry + * potentially exists to avoid CPU writes after IO + * starts and then write it out here.
+ */ + try_to_unmap_flush_dirty(); switch (pageout(page, mapping, sc)) { case PAGE_KEEP: goto keep_locked; -- cgit v1.2.3 From c7e1e3ccfbd153c890240a391f258efaedfa94d0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 4 Sep 2015 15:47:38 -0700 Subject: Documentation/features/vm: add feature description and arch support status for batched TLB flush after unmap Signed-off-by: Mel Gorman Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/features/vm/TLB/arch-support.txt | 40 ++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 Documentation/features/vm/TLB/arch-support.txt diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt new file mode 100644 index 000000000000..261b92e2fb1a --- /dev/null +++ b/Documentation/features/vm/TLB/arch-support.txt @@ -0,0 +1,40 @@ +# +# Feature name: batch-unmap-tlb-flush +# Kconfig: ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +# description: arch supports deferral of TLB flush until multiple pages are unmapped +# + ----------------------- + | arch |status| + ----------------------- + | alpha: | TODO | + | arc: | TODO | + | arm: | TODO | + | arm64: | TODO | + | avr32: | .. | + | blackfin: | TODO | + | c6x: | .. | + | cris: | .. | + | frv: | .. | + | h8300: | .. | + | hexagon: | TODO | + | ia64: | TODO | + | m32r: | TODO | + | m68k: | .. | + | metag: | TODO | + | microblaze: | .. | + | mips: | TODO | + | mn10300: | TODO | + | nios2: | .. | + | openrisc: | .. | + | parisc: | TODO | + | powerpc: | TODO | + | s390: | TODO | + | score: | .. | + | sh: | TODO | + | sparc: | TODO | + | tile: | TODO | + | um: | .. | + | unicore32: | .. | + | x86: | ok | + | xtensa: | TODO | + ----------------------- -- cgit v1.2.3 From c0a294988322a804901fe24222027fe8a34defcb Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 4 Sep 2015 15:47:38 -0700 Subject: mm/memblock: WARN_ON when nid differs from overlap region Each memblock_region has an nid field to indicate the Node ID of this range. For the overlap case, memblock_add_range() inserts the lower part and leaves the upper part as indicated in the overlapped region. If the nid of the new range differs from that of the overlapped region, the information recorded is not correct. This patch adds a WARN_ON when the nid of the new range differs from that of the overlapped region. Signed-off-by: Wei Yang Acked-by: David Rientjes Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/memblock.c b/mm/memblock.c index 87108e77e476..95ce68c6da8a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -566,6 +566,9 @@ repeat: * area, insert that portion. */ if (rbase > base) { +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + WARN_ON(nid != memblock_get_region_node(rgn)); +#endif nr_new++; if (insert) memblock_insert_region(type, i++, base, -- cgit v1.2.3 From 73858173593c31cb94bce63fe1c24eb803bb04e6 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Fri, 4 Sep 2015 15:47:43 -0700 Subject: genalloc: add name arg to gen_pool_get() and devm_gen_pool_create() This change modifies the gen_pool_get() and devm_gen_pool_create() client interfaces, adding one more argument, "name", which identifies a gen_pool object.
Due to the implementation, gen_pool_get() is capable of retrieving only one gen_pool associated with a device, even if multiple gen_pools are created; fortunately, right now that is sufficient for the clients, hence NULL is provided as a valid argument on both the producer devm_gen_pool_create() and consumer gen_pool_get() sides. Because only one created gen_pool per device is addressable, explicitly add a restriction to devm_gen_pool_create() so that only one gen_pool per device can be created; this implies two possible error codes returned by the function, which is accounted for on the client side (only misc/sram). This completes the client-side changes related to the genalloc updates. [akpm@linux-foundation.org: gen_pool_get() cleanup] Signed-off-by: Vladimir Zapolskiy Cc: Philipp Zabel Cc: Greg Kroah-Hartman Cc: Russell King Cc: Nicolas Ferre Cc: Alexandre Belloni Cc: Jean-Christophe Plagniol-Villard Cc: Shawn Guo Cc: Sascha Hauer Cc: Mauro Carvalho Chehab Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-at91/pm.c | 2 +- arch/arm/mach-imx/pm-imx5.c | 2 +- arch/arm/mach-imx/pm-imx6.c | 2 +- arch/arm/mach-socfpga/pm.c | 2 +- drivers/media/platform/coda/coda-common.c | 2 +- drivers/misc/sram.c | 8 ++--- include/linux/genalloc.h | 4 +-- lib/genalloc.c | 49 ++++++++++++++++++------------- 8 files changed, 39 insertions(+), 32 deletions(-) diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index 265ffeb2037e..80e277cfcc8b 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -369,7 +369,7 @@ static void __init at91_pm_sram_init(void) return; } - sram_pool = gen_pool_get(&pdev->dev); + sram_pool = gen_pool_get(&pdev->dev, NULL); if (!sram_pool) { pr_warn("%s: sram pool unavailable!\n", __func__); return; diff --git a/arch/arm/mach-imx/pm-imx5.c b/arch/arm/mach-imx/pm-imx5.c index 1885676c23c0..532d4b08276d 100644 --- a/arch/arm/mach-imx/pm-imx5.c +++ b/arch/arm/mach-imx/pm-imx5.c @@ -297,7 +297,7 @@ static int __init imx_suspend_alloc_ocram( goto put_node; } - ocram_pool = gen_pool_get(&pdev->dev); + ocram_pool = gen_pool_get(&pdev->dev, NULL); if (!ocram_pool) { pr_warn("%s: ocram pool unavailable!\n", __func__); ret = -ENODEV; diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c index 93ecf559d06d..8ff8fc0b261c 100644 --- a/arch/arm/mach-imx/pm-imx6.c +++ b/arch/arm/mach-imx/pm-imx6.c @@ -451,7 +451,7 @@ static int __init imx6q_suspend_init(const struct imx6_pm_socdata *socdata) goto put_node; } - ocram_pool = gen_pool_get(&pdev->dev); + ocram_pool = gen_pool_get(&pdev->dev, NULL); if (!ocram_pool) { pr_warn("%s: ocram pool unavailable!\n", __func__); ret = -ENODEV; diff --git a/arch/arm/mach-socfpga/pm.c b/arch/arm/mach-socfpga/pm.c index 6a4199f2bffb..c378ab0c2431 100644 --- a/arch/arm/mach-socfpga/pm.c +++ b/arch/arm/mach-socfpga/pm.c @@ -56,7 +56,7 @@ static int socfpga_setup_ocram_self_refresh(void) goto put_node; } - ocram_pool = gen_pool_get(&pdev->dev); + ocram_pool = gen_pool_get(&pdev->dev, NULL); if (!ocram_pool) { pr_warn("%s: ocram pool unavailable!\n", __func__); ret = -ENODEV; diff --git a/drivers/media/platform/coda/coda-common.c b/drivers/media/platform/coda/coda-common.c index 58f65486de33..284ac4c934ba 100644 --- a/drivers/media/platform/coda/coda-common.c +++ b/drivers/media/platform/coda/coda-common.c @@ -2157,7 +2157,7 @@ static int coda_probe(struct platform_device *pdev) /* Get IRAM pool from device tree or platform data */ pool = of_gen_pool_get(np, "iram", 0); if (!pool && pdata) - pool = gen_pool_get(pdata->iram_dev);
+ pool = gen_pool_get(pdata->iram_dev, NULL); if (!pool) { dev_err(&pdev->dev, "iram pool not available\n"); return -ENOMEM; diff --git a/drivers/misc/sram.c b/drivers/misc/sram.c index 15c33cc34a80..431e1dd528bc 100644 --- a/drivers/misc/sram.c +++ b/drivers/misc/sram.c @@ -186,10 +186,10 @@ static int sram_probe(struct platform_device *pdev) if (IS_ERR(sram->virt_base)) return PTR_ERR(sram->virt_base); - sram->pool = devm_gen_pool_create(sram->dev, - ilog2(SRAM_GRANULARITY), -1); - if (!sram->pool) - return -ENOMEM; + sram->pool = devm_gen_pool_create(sram->dev, ilog2(SRAM_GRANULARITY), + NUMA_NO_NODE, NULL); + if (IS_ERR(sram->pool)) + return PTR_ERR(sram->pool); ret = sram_reserve_regions(sram, res); if (ret) diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 5383bb1394a1..6afa65e6cdb7 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -118,8 +118,8 @@ extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data); extern struct gen_pool *devm_gen_pool_create(struct device *dev, - int min_alloc_order, int nid); -extern struct gen_pool *gen_pool_get(struct device *dev); + int min_alloc_order, int nid, const char *name); +extern struct gen_pool *gen_pool_get(struct device *dev, const char *name); bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, size_t size); diff --git a/lib/genalloc.c b/lib/genalloc.c index daf0afb6d979..b13cfd1a366e 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -570,24 +570,47 @@ static void devm_gen_pool_release(struct device *dev, void *res) gen_pool_destroy(*(struct gen_pool **)res); } +/** + * gen_pool_get - Obtain the gen_pool (if any) for a device + * @dev: device to retrieve the gen_pool from + * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device + * + * Returns the gen_pool for the device if one is present, or NULL. + */ +struct gen_pool *gen_pool_get(struct device *dev, const char *name) +{ + struct gen_pool **p; + + p = devres_find(dev, devm_gen_pool_release, NULL, NULL); + if (!p) + return NULL; + return *p; +} +EXPORT_SYMBOL_GPL(gen_pool_get); + /** * devm_gen_pool_create - managed gen_pool_create * @dev: device that provides the gen_pool * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents - * @nid: node id of the node the pool structure should be allocated on, or -1 + * @nid: node selector for allocated gen_pool, %NUMA_NO_NODE for all nodes + * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device * * Create a new special memory pool that can be used to manage special purpose * memory not managed by the regular kmalloc/kfree interface. The pool will be * automatically destroyed by the device management code. 
*/ struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, - int nid) + int nid, const char *name) { struct gen_pool **ptr, *pool; + /* Check that genpool to be created is uniquely addressed on device */ + if (gen_pool_get(dev, name)) + return ERR_PTR(-EINVAL); + ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) - return NULL; + return ERR_PTR(-ENOMEM); pool = gen_pool_create(min_alloc_order, nid); if (pool) { @@ -595,29 +618,13 @@ struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, devres_add(dev, ptr); } else { devres_free(ptr); + return ERR_PTR(-ENOMEM); } return pool; } EXPORT_SYMBOL(devm_gen_pool_create); -/** - * gen_pool_get - Obtain the gen_pool (if any) for a device - * @dev: device to retrieve the gen_pool from - * - * Returns the gen_pool for the device if one is present, or NULL. - */ -struct gen_pool *gen_pool_get(struct device *dev) -{ - struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL, - NULL); - - if (!p) - return NULL; - return *p; -} -EXPORT_SYMBOL_GPL(gen_pool_get); - #ifdef CONFIG_OF /** * of_gen_pool_get - find a pool by phandle property @@ -642,7 +649,7 @@ struct gen_pool *of_gen_pool_get(struct device_node *np, of_node_put(np_pool); if (!pdev) return NULL; - return gen_pool_get(&pdev->dev); + return gen_pool_get(&pdev->dev, NULL); } EXPORT_SYMBOL_GPL(of_gen_pool_get); #endif /* CONFIG_OF */ -- cgit v1.2.3 From c98c36355dd6d5c4433c8d17e8eb839ca9b97606 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Fri, 4 Sep 2015 15:47:47 -0700 Subject: genalloc: add support of multiple gen_pools per device This change fills the devm_gen_pool_create()/gen_pool_get() "name" argument stub with contents and extends of_gen_pool_get() functionality on this basis. If there is no platform device associated with the device node passed to of_gen_pool_get(), the function attempts to get a "label" property or the device node name (mirroring the MTD OF partition standard) and looks for a named gen_pool registered by the device of the parent device node. The main idea of the change is to allow registration of independent gen_pools under the same umbrella device, say "partitions" on a "storage device"; the original functionality of one "partition" per "storage device" is untouched.
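As a minimal sketch of the usage this enables (illustrative only — the probe helper, the "sram0"/"sram1" pool names and the 32-byte granularity are invented; the signatures are the ones introduced by the two genalloc patches above):

#include <linux/device.h>
#include <linux/err.h>
#include <linux/genalloc.h>
#include <linux/log2.h>
#include <linux/numa.h>

/* Hypothetical probe helper: two named pools under one device */
static int example_pools_init(struct device *dev)
{
	struct gen_pool *a, *b;

	a = devm_gen_pool_create(dev, ilog2(32), NUMA_NO_NODE, "sram0");
	if (IS_ERR(a))
		return PTR_ERR(a);

	b = devm_gen_pool_create(dev, ilog2(32), NUMA_NO_NODE, "sram1");
	if (IS_ERR(b))
		return PTR_ERR(b);

	/* Reusing an already-taken name now fails with -EINVAL */
	if (!IS_ERR(devm_gen_pool_create(dev, ilog2(32), NUMA_NO_NODE,
					 "sram0")))
		return -EINVAL;

	/* A consumer retrieves a particular pool by its name */
	return gen_pool_get(dev, "sram1") == b ? 0 : -ENODEV;
}

Chunks would still have to be added with gen_pool_add() before allocating from either pool; the sketch only shows the create/lookup pairing by name.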
[akpm@linux-foundation.org: fix constness in devres_find()] [dan.carpenter@oracle.com: freeing const data pointers] Signed-off-by: Vladimir Zapolskiy Cc: Philipp Zabel Cc: Greg Kroah-Hartman Cc: Russell King Cc: Nicolas Ferre Cc: Alexandre Belloni Cc: Jean-Christophe Plagniol-Villard Cc: Shawn Guo Cc: Sascha Hauer Cc: Mauro Carvalho Chehab Cc: Arnd Bergmann Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 2 ++ lib/genalloc.c | 71 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 59 insertions(+), 14 deletions(-) diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 6afa65e6cdb7..7ff168d06967 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -59,6 +59,8 @@ struct gen_pool { genpool_algo_t algo; /* allocation function */ void *data; + + const char *name; }; /* diff --git a/lib/genalloc.c b/lib/genalloc.c index b13cfd1a366e..116a166b096f 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -160,6 +160,7 @@ struct gen_pool *gen_pool_create(int min_alloc_order, int nid) pool->min_alloc_order = min_alloc_order; pool->algo = gen_pool_first_fit; pool->data = NULL; + pool->name = NULL; } return pool; } @@ -252,8 +253,8 @@ void gen_pool_destroy(struct gen_pool *pool) kfree(chunk); } + kfree_const(pool->name); kfree(pool); - return; } EXPORT_SYMBOL(gen_pool_destroy); @@ -570,6 +571,20 @@ static void devm_gen_pool_release(struct device *dev, void *res) gen_pool_destroy(*(struct gen_pool **)res); } +static int devm_gen_pool_match(struct device *dev, void *res, void *data) +{ + struct gen_pool **p = res; + + /* NULL data matches only a pool without an assigned name */ + if (!data && !(*p)->name) + return 1; + + if (!data || !(*p)->name) + return 0; + + return !strcmp((*p)->name, data); +} + /** * gen_pool_get - Obtain the gen_pool (if any) for a device * @dev: device to retrieve the gen_pool from @@ -581,7 +596,8 @@ struct gen_pool *gen_pool_get(struct device *dev, const char *name) { struct gen_pool **p; - p = devres_find(dev, devm_gen_pool_release, NULL, NULL); + p = devres_find(dev, devm_gen_pool_release, devm_gen_pool_match, + (void *)name); if (!p) return NULL; return *p; @@ -603,25 +619,38 @@ struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, int nid, const char *name) { struct gen_pool **ptr, *pool; + const char *pool_name = NULL; /* Check that genpool to be created is uniquely addressed on device */ if (gen_pool_get(dev, name)) return ERR_PTR(-EINVAL); + if (name) { + pool_name = kstrdup_const(name, GFP_KERNEL); + if (!pool_name) + return ERR_PTR(-ENOMEM); + } + ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) - return ERR_PTR(-ENOMEM); + goto free_pool_name; pool = gen_pool_create(min_alloc_order, nid); - if (pool) { - *ptr = pool; - devres_add(dev, ptr); - } else { - devres_free(ptr); - return ERR_PTR(-ENOMEM); - } + if (!pool) + goto free_devres; + + *ptr = pool; + pool->name = pool_name; + devres_add(dev, ptr); return pool; + +free_devres: + devres_free(ptr); +free_pool_name: + kfree_const(pool_name); + + return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(devm_gen_pool_create); @@ -640,16 +669,30 @@ struct gen_pool *of_gen_pool_get(struct device_node *np, const char *propname, int index) { struct platform_device *pdev; - struct device_node *np_pool; + struct device_node *np_pool, *parent; + const char *name = NULL; + struct gen_pool *pool = NULL; np_pool = of_parse_phandle(np, propname, index); if (!np_pool) return NULL; + 
pdev = of_find_device_by_node(np_pool); + if (!pdev) { + /* Check if named gen_pool is created by parent node device */ + parent = of_get_parent(np_pool); + pdev = of_find_device_by_node(parent); + of_node_put(parent); + + of_property_read_string(np_pool, "label", &name); + if (!name) + name = np_pool->name; + } + if (pdev) + pool = gen_pool_get(&pdev->dev, name); of_node_put(np_pool); - if (!pdev) - return NULL; - return gen_pool_get(&pdev->dev, NULL); + + return pool; } EXPORT_SYMBOL_GPL(of_gen_pool_get); #endif /* CONFIG_OF */ -- cgit v1.2.3 From ce9ce6659a5775047ad529ed77ab119da5fb8b0c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 4 Sep 2015 15:47:50 -0700 Subject: mm: memcontrol: bring back the VM_BUG_ON() in mem_cgroup_swapout() Clark stumbled over a VM_BUG_ON() in -RT which was then removed by Johannes in commit f371763a79d ("mm: memcontrol: fix false-positive VM_BUG_ON() on -rt"). The comment before that patch was a tiny bit better than it is now. While the patch claimed to fix a false-positive on -RT, this was not the case. None of the -RT folks ACKed it, and it was not a false-positive report. That was a *real* problem. This patch updates the comment, which is improper because it refers to "disabled preemption" as a consequence of that lock being taken. A spin_lock() disables preemption, true, but in this case the code relies on the fact that the lock _also_ disables interrupts once it is acquired. And this is the important detail (which was checked by the VM_BUG_ON()) that needs to be pointed out. This is the hint one needs while looking at the code. It was explained by Johannes on the list that the per-CPU variables are protected by local_irq_save(). The BUG_ON() was helpful. This code has been worked around in -RT in the meantime. I wouldn't mind running into more of those if the code in question uses a *special* kind of locking, since now there is no verification (in terms of lockdep or BUG_ON()), and therefore I bring the VM_BUG_ON() check back in. The two functions after the comment could also have a "local_irq_save()" dance around them in order to serialize access to the per-CPU variables. This has been avoided because the interrupts should be off. Signed-off-by: Sebastian Andrzej Siewior Acked-by: Johannes Weiner Cc: Thomas Gleixner Cc: Steven Rostedt Cc: Clark Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index acb93c554f6e..1af057575ce9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5965,7 +5965,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); - /* Caller disabled preemption with mapping->tree_lock */ + /* + * Interrupts should be disabled here because the caller holds the + * mapping->tree_lock lock which is taken with interrupts-off. It is + * important here to have the interrupts disabled because it is the + * only synchronisation we have for updating the per-CPU variables. + */ + VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, -1); memcg_check_events(memcg, page); } -- cgit v1.2.3 From d899844e9c98c9c74b4d9926fd3bd66a225f6978 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 4 Sep 2015 15:47:53 -0700 Subject: mm: fix status code which move_pages() returns for zero page The manpage for move_pages(2) specifies that the status code for the zero page is supposed to be -EFAULT.
Currently the kernel returns -ENOENT in this case. follow_page() can do it for us if we ask for FOLL_DUMP. The use of FOLL_DUMP also means that upper-layer page table pages are no longer allocated. Signed-off-by: Kirill A. Shutemov Reviewed-by: Christoph Lameter Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index eb4267107d1f..5c08cab5419e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1226,7 +1226,9 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; - page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); + /* FOLL_DUMP to ignore special (like zero) pages */ + page = follow_page(vma, pp->addr, + FOLL_GET | FOLL_SPLIT | FOLL_DUMP); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1236,10 +1238,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!page) goto set_status; - /* Use PageReserved to check for zero page */ - if (PageReserved(page)) - goto put_and_set; - pp->page = page; err = page_to_nid(page); @@ -1396,18 +1394,14 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (!vma || addr < vma->vm_start) goto set_status; - page = follow_page(vma, addr, 0); + /* FOLL_DUMP to ignore special (like zero) pages */ + page = follow_page(vma, addr, FOLL_DUMP); err = PTR_ERR(page); if (IS_ERR(page)) goto set_status; - err = -ENOENT; - /* Use PageReserved to check for zero page */ - if (!page || PageReserved(page)) - goto set_status; - - err = page_to_nid(page); + err = page ? page_to_nid(page) : -ENOENT; set_status: *status = err; -- cgit v1.2.3 From 1027e4436b6a5c413c95d95e50d0f26348a602ac Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 4 Sep 2015 15:47:55 -0700 Subject: mm: make GUP handle pfn mapping unless FOLL_GET is requested With DAX, pfn mapping is becoming more common. The patch adjusts GUP code to cover pfn mapping for cases when we don't need struct page to proceed. To make it possible, let's change follow_page() code to return the -EEXIST error code if a proper page table entry exists but there is no corresponding struct page. __get_user_pages() would ignore the error code and move to the next page frame. The immediate effect of the change is working MAP_POPULATE and mlock() on DAX mappings. [akpm@linux-foundation.org: fix arm64 build] Signed-off-by: Kirill A.
Shutemov Reviewed-by: Toshi Kani Acked-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 6297f6bccfb1..a798293fc648 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -12,7 +12,9 @@ #include <linux/sched.h> #include <linux/rwsem.h> #include <linux/hugetlb.h> + #include <asm/pgtable.h> +#include <asm/tlbflush.h> #include "internal.h" @@ -32,6 +34,30 @@ static struct page *no_page_table(struct vm_area_struct *vma, return NULL; } +static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, + pte_t *pte, unsigned int flags) +{ + /* No page to get reference */ + if (flags & FOLL_GET) + return -EFAULT; + + if (flags & FOLL_TOUCH) { + pte_t entry = *pte; + + if (flags & FOLL_WRITE) + entry = pte_mkdirty(entry); + entry = pte_mkyoung(entry); + + if (!pte_same(*pte, entry)) { + set_pte_at(vma->vm_mm, address, pte, entry); + update_mmu_cache(vma, address, pte); + } + } + + /* Proper page table entry exists, but no corresponding struct page */ + return -EEXIST; +} + static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { @@ -73,10 +99,21 @@ retry: page = vm_normal_page(vma, address, pte); if (unlikely(!page)) { - if ((flags & FOLL_DUMP) || - !is_zero_pfn(pte_pfn(pte))) - goto bad_page; - page = pte_page(pte); + if (flags & FOLL_DUMP) { + /* Avoid special (like zero) pages in core dumps */ + page = ERR_PTR(-EFAULT); + goto out; + } + + if (is_zero_pfn(pte_pfn(pte))) { + page = pte_page(pte); + } else { + int ret; + + ret = follow_pfn_pte(vma, address, ptep, flags); + page = ERR_PTR(ret); + goto out; + } } if (flags & FOLL_GET) @@ -114,12 +151,9 @@ retry: unlock_page(page); } } +out: pte_unmap_unlock(ptep, ptl); return page; -bad_page: - pte_unmap_unlock(ptep, ptl); - return ERR_PTR(-EFAULT); - no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) @@ -489,9 +523,15 @@ retry: goto next_page; } BUG(); - } - if (IS_ERR(page)) + } else if (PTR_ERR(page) == -EEXIST) { + /* + * Proper page table entry exists, but no corresponding + * struct page. + */ + goto next_page; + } else if (IS_ERR(page)) { return i ? i : PTR_ERR(page); + } if (pages) { pages[i] = page; flush_anon_page(vma, page, start); -- cgit v1.2.3 From 31aafb45f4e1c34e4aba37c150ae4e74880b46ed Mon Sep 17 00:00:00 2001 From: Nicholas Krause Date: Fri, 4 Sep 2015 15:47:58 -0700 Subject: mm/hugetlb.c: make vma_shareable() return bool This makes vma_shareable() return bool due to this particular function only ever returning either one or zero as its return value.
Signed-off-by: Nicholas Krause Acked-by: Mike Kravetz Acked-by: Naoya Horiguchi Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8c3087089d8..5d1d84ca9674 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3779,7 +3779,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, return saddr; } -static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) +static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) { unsigned long base = addr & PUD_MASK; unsigned long end = base + PUD_SIZE; @@ -3789,8 +3789,8 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) */ if (vma->vm_flags & VM_MAYSHARE && vma->vm_start <= base && end <= vma->vm_end) - return 1; - return 0; + return true; + return false; } /* -- cgit v1.2.3 From df1eab303c836c98a79de427aa1c7d6812acaaa9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 4 Sep 2015 15:48:01 -0700 Subject: mremap: don't leak new_vma if f_op->mremap() fails move_vma() can't just return if f_op->mremap() fails; we should unmap the new vma like we do if move_page_tables() fails. To avoid the code duplication, this patch moves the "move entries back" code under the new "if (err)" branch. Signed-off-by: Oleg Nesterov Acked-by: David Rientjes Cc: Benjamin LaHaise Cc: Hugh Dickins Cc: Jeff Moyer Cc: Kirill Shutemov Cc: Pavel Emelyanov Cc: Laurent Dufour Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index a7c93eceb1c8..f54a43fa4b79 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -276,6 +276,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, need_rmap_locks); if (moved_len < old_len) { + err = -ENOMEM; + } else if (vma->vm_file && vma->vm_file->f_op->mremap) { + err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); + } + + if (unlikely(err)) { /* * On error, move entries back from new area to old, * which will succeed since page tables still there, @@ -286,16 +292,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, vma = new_vma; old_len = new_len; old_addr = new_addr; - new_addr = -ENOMEM; + new_addr = err; } else { - if (vma->vm_file && vma->vm_file->f_op->mremap) { - err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); - if (err < 0) { - move_page_tables(new_vma, new_addr, vma, - old_addr, moved_len, true); - return err; - } - } arch_remap(mm, old_addr, old_addr + old_len, new_addr, new_addr + new_len); } -- cgit v1.2.3 From 5477e70a6420a6b7ca96c8e21413ee1c96a84260 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 4 Sep 2015 15:48:04 -0700 Subject: mm: move ->mremap() from file_operations to vm_operations_struct vma->vm_ops->mremap() looks more natural and clean in move_vma(), and this way ->mremap() can have more users. Say, vdso. While at it, s/aio_ring_remap/aio_ring_mremap/. Note: this is the minimal change before ->mremap() finds another user in file_operations; this method should have more arguments, and it can be used to kill arch_remap(). Signed-off-by: Oleg Nesterov Acked-by: Pavel Emelyanov Acked-by: Kirill A.
Shutemov Cc: David Rientjes Cc: Benjamin LaHaise Cc: Hugh Dickins Cc: Jeff Moyer Cc: Laurent Dufour Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 27 ++++++++++++++++++--------- include/linux/fs.h | 1 - include/linux/mm.h | 1 + mm/mremap.c | 4 ++-- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 480440f4701f..155f84253f33 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -308,15 +308,9 @@ static void aio_free_ring(struct kioctx *ctx) } } -static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) -{ - vma->vm_flags |= VM_DONTEXPAND; - vma->vm_ops = &generic_file_vm_ops; - return 0; -} - -static int aio_ring_remap(struct file *file, struct vm_area_struct *vma) +static int aio_ring_mremap(struct vm_area_struct *vma) { + struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; @@ -342,9 +336,24 @@ static int aio_ring_remap(struct file *file, struct vm_area_struct *vma) return res; } +static const struct vm_operations_struct aio_ring_vm_ops = { + .mremap = aio_ring_mremap, +#if IS_ENABLED(CONFIG_MMU) + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = filemap_page_mkwrite, +#endif +}; + +static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_flags |= VM_DONTEXPAND; + vma->vm_ops = &aio_ring_vm_ops; + return 0; +} + static const struct file_operations aio_ring_fops = { .mmap = aio_ring_mmap, - .mremap = aio_ring_remap, }; #if IS_ENABLED(CONFIG_MIGRATION) diff --git a/include/linux/fs.h b/include/linux/fs.h index fbd780c33c5f..864203c10dbc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1612,7 +1612,6 @@ struct file_operations { long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); - int (*mremap)(struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 77a9d609523e..8b257c43855b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -247,6 +247,7 @@ struct vm_fault { struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); + int (*mremap)(struct vm_area_struct * area); int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); diff --git a/mm/mremap.c b/mm/mremap.c index f54a43fa4b79..3310378bb60f 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -277,8 +277,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, need_rmap_locks); if (moved_len < old_len) { err = -ENOMEM; - } else if (vma->vm_file && vma->vm_file->f_op->mremap) { - err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); + } else if (vma->vm_ops && vma->vm_ops->mremap) { + err = vma->vm_ops->mremap(new_vma); } if (unlikely(err)) { -- cgit v1.2.3 From d456fb9e5254df433d4806769d7ff75d80d66aa4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 4 Sep 2015 15:48:07 -0700 Subject: mremap: don't do mm_populate(new_addr) on failure move_vma() sets *locked even if move_page_tables() or ->mremap() fails; change sys_mremap() to check "ret & ~PAGE_MASK". I think we should simply remove the VM_LOCKED code in move_vma(), that is why this patch doesn't change move_vma().
But this needs more cleanups. Signed-off-by: Oleg Nesterov Acked-by: David Rientjes Cc: Benjamin LaHaise Cc: Hugh Dickins Cc: Jeff Moyer Cc: Kirill A. Shutemov Cc: Laurent Dufour Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index 3310378bb60f..7dcf7b42068e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -578,8 +578,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); } out: - if (ret & ~PAGE_MASK) + if (ret & ~PAGE_MASK) { vm_unacct_memory(charged); + locked = 0; + } up_write(&current->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); -- cgit v1.2.3 From 1d3916869798755968b3cd764ab21f2bb86ffff7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 4 Sep 2015 15:48:10 -0700 Subject: mremap: don't do unnecessary checks if new_len == old_len The "new_len > old_len" branch in vma_to_resize() looks very confusing. It only covers the VM_DONTEXPAND/pgoff checks but everything below is equally unneeded if new_len == old_len. Change this code to return if "new_len == old_len"; new_len < old_len is not possible, otherwise the code below is wrong anyway. Signed-off-by: Oleg Nesterov Acked-by: David Rientjes Cc: Benjamin LaHaise Cc: Hugh Dickins Cc: Jeff Moyer Cc: Kirill A. Shutemov Cc: Laurent Dufour Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 7dcf7b42068e..d3f42bece564 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -346,6 +346,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = find_vma(mm, addr); + unsigned long pgoff; if (!vma || vma->vm_start > addr) return ERR_PTR(-EFAULT); @@ -357,17 +358,17 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (old_len > vma->vm_end - addr) return ERR_PTR(-EFAULT); + if (new_len == old_len) + return vma; + /* Need to be careful about a growing mapping */ - if (new_len > old_len) { - unsigned long pgoff; - - if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) - return ERR_PTR(-EFAULT); - pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; - pgoff += vma->vm_pgoff; - if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) - return ERR_PTR(-EINVAL); - } + pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + pgoff += vma->vm_pgoff; + if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) + return ERR_PTR(-EINVAL); + + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) + return ERR_PTR(-EFAULT); if (vma->vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; -- cgit v1.2.3 From 9943242ca468149c4ce30d4633524c0866d4a87b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 4 Sep 2015 15:48:13 -0700 Subject: mremap: simplify the "overlap" check in mremap_to() Minor, but this check is overcomplicated. Two half-intervals do NOT overlap if END1 <= START2 || END2 <= START1; mremap_to() just needs to negate this check. Signed-off-by: Oleg Nesterov Acked-by: David Rientjes Cc: Benjamin LaHaise Cc: Hugh Dickins Cc: Jeff Moyer Cc: Kirill A.
Shutemov Cc: Laurent Dufour Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index d3f42bece564..5a71cce8c6ea 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -407,13 +407,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) goto out; - /* Check if the location we're moving into overlaps the * old location at all, and fail if it does. */ - if ((new_addr <= addr) && (new_addr+new_len) > addr) - goto out; - - if ((addr <= new_addr) && (addr+old_len) > new_addr) + /* Ensure the old/new locations do not overlap */ + if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; ret = do_munmap(mm, new_addr, new_len); -- cgit v1.2.3 From 4e6dab4233f667c0ae465e5cb46603b49b4f6d74 Mon Sep 17 00:00:00 2001 From: "minkyung88.kim" Date: Fri, 4 Sep 2015 15:48:16 -0700 Subject: mm: remove struct node_active_region struct node_active_region is not used anymore. Remove it. Signed-off-by: minkyung88.kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 754c25966a0a..ac00e2050943 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -690,14 +690,6 @@ struct zonelist { #endif }; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -struct node_active_region { - unsigned long start_pfn; - unsigned long end_pfn; - int nid; -}; -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - #ifndef CONFIG_DISCONTIGMEM /* The array of struct pages - for discontigmem use pgdat->lmem_map */ extern struct page *mem_map; -- cgit v1.2.3 From d9e7e37b4d83371d08650612e0bb0b80a1240289 Mon Sep 17 00:00:00 2001 From: Nicholas Krause Date: Fri, 4 Sep 2015 15:48:19 -0700 Subject: mm/dmapool.c: change is_page_busy() return from int to bool This makes the function is_page_busy() return bool rather than an int, due to this particular function's single return statement only ever evaluating to either one or zero. Signed-off-by: Nicholas Krause Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/dmapool.c b/mm/dmapool.c index fd5fe4342e93..59d10d16f0a5 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -242,7 +242,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) return page; } -static inline int is_page_busy(struct dma_page *page) +static inline bool is_page_busy(struct dma_page *page) { return page->in_use != 0; } -- cgit v1.2.3 From ca1d6c7d9d461effa2c4e8b9b227a14e9fdcf1cc Mon Sep 17 00:00:00 2001 From: Nicholas Krause Date: Fri, 4 Sep 2015 15:48:22 -0700 Subject: mm/memory.c: make tlb_next_batch() return bool This makes tlb_next_batch() return bool due to this particular function only ever returning either one or zero as its return value.
Signed-off-by: Nicholas Krause Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 2961fb654369..bb04d8f2f86c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -181,22 +181,22 @@ static void check_sync_rss_stat(struct task_struct *task) #ifdef HAVE_GENERIC_MMU_GATHER -static int tlb_next_batch(struct mmu_gather *tlb) +static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; batch = tlb->active; if (batch->next) { tlb->active = batch->next; - return 1; + return true; } if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) - return 0; + return false; batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) - return 0; + return false; tlb->batch_count++; batch->next = NULL; @@ -206,7 +206,7 @@ static bool tlb_next_batch(struct mmu_gather *tlb) tlb->active->next = batch; tlb->active = batch; - return 1; + return true; } /* tlb_gather_mmu -- cgit v1.2.3 From 1ecef9ed0f63bfff58895a4f3aec751e907c7f3d Mon Sep 17 00:00:00 2001 From: Nicholas Krause Date: Fri, 4 Sep 2015 15:48:24 -0700 Subject: mm/madvise.c: make madvise_behavior_valid() return bool This makes the madvise_behavior_valid() function return bool due to this particular function always returning the value of either one or zero as its return value. Signed-off-by: Nicholas Krause Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 911357973905..ce3a4222c7e7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -386,7 +386,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, } } -static int +static bool madvise_behavior_valid(int behavior) { switch (behavior) { @@ -408,10 +408,10 @@ madvise_behavior_valid(int behavior) #endif case MADV_DONTDUMP: case MADV_DODUMP: - return 1; + return true; default: - return 0; + return false; } } -- cgit v1.2.3 From 559ec2f8fd50981821621f52db5e1a8ffcf8d792 Mon Sep 17 00:00:00 2001 From: Nicholas Krause Date: Fri, 4 Sep 2015 15:48:27 -0700 Subject: mm/hugetlb.c: make vma_has_reserves() return bool This makes vma_has_reserves() return bool due to this particular function only returning either one or zero as its return value. Signed-off-by: Nicholas Krause Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5d1d84ca9674..51ae41d0fbc0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -616,7 +616,7 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) } /* Returns true if the VMA has associated reserve pages */ -static int vma_has_reserves(struct vm_area_struct *vma, long chg) +static bool vma_has_reserves(struct vm_area_struct *vma, long chg) { if (vma->vm_flags & VM_NORESERVE) { /* @@ -629,23 +629,23 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg) * properly, so add work-around here. */ if (vma->vm_flags & VM_MAYSHARE && chg == 0) - return 1; + return true; else - return 0; + return false; } /* Shared mappings always use reserves */ if (vma->vm_flags & VM_MAYSHARE) - return 1; + return true; /* * Only the process that called mmap() has reserves for * private mappings.
*/ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) - return 1; + return true; - return 0; + return false; } static void enqueue_huge_page(struct hstate *h, struct page *page) -- cgit v1.2.3