From 245d73698ed7abdc7e520dfa38048bb80ce89571 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 2 Oct 2019 16:41:58 -0700 Subject: audit: Report suspicious O_CREAT usage This renames the very specific audit_log_link_denied() to audit_log_path_denied() and adds the AUDIT_* type as an argument. This allows for the creation of the new AUDIT_ANOM_CREAT that can be used to report the fifo/regular file creation restrictions that were introduced in commit 30aba6656f61 ("namei: allow restricted O_CREAT of FIFOs and regular files"). Signed-off-by: Kees Cook Signed-off-by: Paul Moore --- kernel/audit.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index da8dc0db5bd3..d75485aa25ff 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2155,18 +2155,19 @@ void audit_log_task_info(struct audit_buffer *ab) EXPORT_SYMBOL(audit_log_task_info); /** - * audit_log_link_denied - report a link restriction denial - * @operation: specific link operation + * audit_log_path_denied - report a path restriction denial + * @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc) + * @operation: specific operation name */ -void audit_log_link_denied(const char *operation) +void audit_log_path_denied(int type, const char *operation) { struct audit_buffer *ab; if (!audit_enabled || audit_dummy_context()) return; - /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */ - ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_LINK); + /* Generate log with subject, operation, outcome. */ + ab = audit_log_start(audit_context(), GFP_KERNEL, type); if (!ab) return; audit_log_format(ab, "op=%s", operation); -- cgit v1.2.3 From 0f8b5b6d56b5fa4085c06945ea3e1ee5941ecfeb Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:17 -0700 Subject: kgdb: Remove unused DCPU_SSTEP definition From doing a 'git log --patch kernel/debug', it looks as if DCPU_SSTEP has never been used. Presumably it used to be used back when kgdb was out of tree and nobody thought to delete the definition when the usage went away. Delete. Signed-off-by: Douglas Anderson Acked-by: Jason Wessel Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index b4a7c326d546..804b0fe5a0ba 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -33,7 +33,6 @@ struct kgdb_state { #define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ #define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ #define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ -#define DCPU_SSTEP 0x8 /* CPU is single stepping */ struct debuggerinfo_struct { void *debuggerinfo; -- cgit v1.2.3 From 54af3e39eed7d77f0923511f3c7f446e7d477635 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:18 -0700 Subject: kdb: Remove unused "argcount" param from kdb_bt1(); make btaprompt bool The kdb_bt1() had a mysterious "argcount" parameter passed in (always the number 5, by the way) and never used. Presumably this is just old cruft. Remove it. While at it, upgrade the btaprompt parameter to a full fledged bool instead of an int. 
Signed-off-by: Douglas Anderson Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_bt.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 7e2379aa0a1e..120fc686c919 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -78,8 +78,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) */ static int -kdb_bt1(struct task_struct *p, unsigned long mask, - int argcount, int btaprompt) +kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) { char buffer[2]; if (kdb_getarea(buffer[0], (unsigned long)p) || @@ -106,7 +105,6 @@ int kdb_bt(int argc, const char **argv) { int diag; - int argcount = 5; int btaprompt = 1; int nextarg; unsigned long addr; @@ -125,7 +123,7 @@ kdb_bt(int argc, const char **argv) /* Run the active tasks first */ for_each_online_cpu(cpu) { p = kdb_curr_task(cpu); - if (kdb_bt1(p, mask, argcount, btaprompt)) + if (kdb_bt1(p, mask, btaprompt)) return 0; } /* Now the inactive tasks */ @@ -134,7 +132,7 @@ kdb_bt(int argc, const char **argv) return 0; if (task_curr(p)) continue; - if (kdb_bt1(p, mask, argcount, btaprompt)) + if (kdb_bt1(p, mask, btaprompt)) return 0; } kdb_while_each_thread(g, p); } else if (strcmp(argv[0], "btp") == 0) { @@ -148,7 +146,7 @@ kdb_bt(int argc, const char **argv) p = find_task_by_pid_ns(pid, &init_pid_ns); if (p) { kdb_set_current_task(p); - return kdb_bt1(p, ~0UL, argcount, 0); + return kdb_bt1(p, ~0UL, false); } kdb_printf("No process with pid == %ld found\n", pid); return 0; @@ -159,7 +157,7 @@ kdb_bt(int argc, const char **argv) if (diag) return diag; kdb_set_current_task((struct task_struct *)addr); - return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0); + return kdb_bt1((struct task_struct *)addr, ~0UL, false); } else if (strcmp(argv[0], "btc") == 0) { unsigned long cpu = ~0; struct task_struct *save_current_task = kdb_current_task; @@ -211,7 +209,7 @@ kdb_bt(int argc, const char **argv) kdb_show_stack(kdb_current_task, (void *)addr); return 0; } else { - return kdb_bt1(kdb_current_task, ~0UL, argcount, 0); + return kdb_bt1(kdb_current_task, ~0UL, false); } } -- cgit v1.2.3 From 55a7e23f461fc2c321d7efcdeca1750085e9323f Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:19 -0700 Subject: kdb: Fix "btc <cpu>" crash if the CPU didn't round up I noticed that when I did "btc <cpu>" and the CPU I passed in hadn't rounded up, I'd crash. I was going to copy the same fix from commit 162bc7f5afd7 ("kdb: Don't back trace on a cpu that didn't round up") into the "not all the CPUs" case, but decided it'd be better to clean things up a little bit. This consolidates the two code paths. It is _slightly_ wasteful in that the checks for "cpu" being too small or being offline aren't really needed when we're iterating over all online CPUs, but that really shouldn't hurt. Better to have the same code path. While at it, eliminate at least one slightly ugly (and totally needless) recursive use of kdb_parse().
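For reference, a sketch of the kdb interaction this fixes (the CPU number is illustrative):

    [0]kdb> btc 3
    WARNING: no task for cpu 3

With the consolidated code path the command prints a warning such as the one above (the message text comes from the new kdb_bt_cpu() helper in the diff below) and returns cleanly, where previously it could crash.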
Signed-off-by: Douglas Anderson Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_bt.c | 61 ++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 120fc686c919..d9af139f9a31 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -101,6 +101,27 @@ kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) return 0; } +static void +kdb_bt_cpu(unsigned long cpu) +{ + struct task_struct *kdb_tsk; + + if (cpu >= num_possible_cpus() || !cpu_online(cpu)) { + kdb_printf("WARNING: no process for cpu %ld\n", cpu); + return; + } + + /* If a CPU failed to round up we could be here */ + kdb_tsk = KDB_TSK(cpu); + if (!kdb_tsk) { + kdb_printf("WARNING: no task for cpu %ld\n", cpu); + return; + } + + kdb_set_current_task(kdb_tsk); + kdb_bt1(kdb_tsk, ~0UL, false); +} + int kdb_bt(int argc, const char **argv) { @@ -161,7 +182,6 @@ kdb_bt(int argc, const char **argv) } else if (strcmp(argv[0], "btc") == 0) { unsigned long cpu = ~0; struct task_struct *save_current_task = kdb_current_task; - char buf[80]; if (argc > 1) return KDB_ARGCOUNT; if (argc == 1) { @@ -169,35 +189,22 @@ kdb_bt(int argc, const char **argv) if (diag) return diag; } - /* Recursive use of kdb_parse, do not use argv after - * this point */ - argv = NULL; if (cpu != ~0) { - if (cpu >= num_possible_cpus() || !cpu_online(cpu)) { - kdb_printf("no process for cpu %ld\n", cpu); - return 0; - } - sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu)); - kdb_parse(buf); - return 0; - } - kdb_printf("btc: cpu status: "); - kdb_parse("cpu\n"); - for_each_online_cpu(cpu) { - void *kdb_tsk = KDB_TSK(cpu); - - /* If a CPU failed to round up we could be here */ - if (!kdb_tsk) { - kdb_printf("WARNING: no task for cpu %ld\n", - cpu); - continue; + kdb_bt_cpu(cpu); + } else { + /* + * Recursive use of kdb_parse, do not use argv after + * this point. + */ + argv = NULL; + kdb_printf("btc: cpu status: "); + kdb_parse("cpu\n"); + for_each_online_cpu(cpu) { + kdb_bt_cpu(cpu); + touch_nmi_watchdog(); } - - sprintf(buf, "btt 0x%px\n", kdb_tsk); - kdb_parse(buf); - touch_nmi_watchdog(); + kdb_set_current_task(save_current_task); } - kdb_set_current_task(save_current_task); return 0; } else { if (argc) { -- cgit v1.2.3 From 2277b492582d5525244519f60da6f9daea5ef41a Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:20 -0700 Subject: kdb: Fix stack crawling on 'running' CPUs that aren't the master In kdb when you do 'btc' (back trace on CPU) it doesn't necessarily give you the right info. Specifically on many architectures (including arm64, where I tested) you can't dump the stack of a "running" process that isn't the process running on the current CPU. This can be seen by this: echo SOFTLOCKUP > /sys/kernel/debug/provoke-crash/DIRECT # wait 2 seconds g Here's what I see now on rk3399-gru-kevin. I see the stack crawl for the CPU that handled the sysrq but everything else just shows me stuck in __switch_to() which is bogus: ====== [0]kdb> btc btc: cpu status: Currently on cpu 0 Available cpus: 0, 1-3(I), 4, 5(I) Stack traceback for pid 0 0xffffff801101a9c0 0 0 1 0 R 0xffffff801101b3b0 *swapper/0 Call trace: dump_backtrace+0x0/0x138 ... kgdb_compiled_brk_fn+0x34/0x44 ... 
sysrq_handle_dbg+0x34/0x5c Stack traceback for pid 0 0xffffffc0f175a040 0 0 1 1 I 0xffffffc0f175aa30 swapper/1 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f65616c0 Stack traceback for pid 0 0xffffffc0f175d040 0 0 1 2 I 0xffffffc0f175da30 swapper/2 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f65806c0 Stack traceback for pid 0 0xffffffc0f175b040 0 0 1 3 I 0xffffffc0f175ba30 swapper/3 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f659f6c0 Stack traceback for pid 1474 0xffffffc0dde8b040 1474 727 1 4 R 0xffffffc0dde8ba30 bash Call trace: __switch_to+0x1e4/0x240 __schedule+0x464/0x618 0xffffffc0dde8b040 Stack traceback for pid 0 0xffffffc0f17b0040 0 0 1 5 I 0xffffffc0f17b0a30 swapper/5 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f65dd6c0 === The problem is that 'btc' eventually boils down to show_stack(task_struct, NULL); ...and show_stack() doesn't work for "running" CPUs because their registers haven't been stashed. On x86 things might work better (I haven't tested) because kdb has a special case for x86 in kdb_show_stack() where it passes the stack pointer to show_stack(). This wouldn't work on arm64 where the stack crawling function seems to need the "fp" and "pc", not the "sp", which is presumably why arm64's show_stack() function totally ignores the "sp" parameter. NOTE: we _can_ get a good stack dump for all the cpus if we manually switch each one to the kdb master and do a back trace. AKA: cpu 4 bt ...will give the expected trace. That's because arm64's dump_backtrace will now see that "tsk == current" and go through a different path. In this patch I fix the problems by catching a request to stack crawl a task that's running on a CPU and then I ask that CPU to do the stack crawl. NOTE: this will (presumably) change what stack crawls are printed for x86 machines. Now kdb functions will show up in the stack crawl. Presumably this is OK but if it's not we can go back and add a special case for x86 again. Signed-off-by: Douglas Anderson Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 34 ++++++++++++++++++++++++++++++++++ kernel/debug/debug_core.h | 2 ++ kernel/debug/kdb/kdb_bt.c | 19 +++++++------------ 3 files changed, 43 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index f76d6f77dd5e..70e86b4b4932 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -441,6 +441,37 @@ setundefined: return 0; } +#ifdef CONFIG_KGDB_KDB +void kdb_dump_stack_on_cpu(int cpu) +{ + if (cpu == raw_smp_processor_id()) { + dump_stack(); + return; + } + + if (!(kgdb_info[cpu].exception_state & DCPU_IS_SLAVE)) { + kdb_printf("ERROR: Task on cpu %d didn't stop in the debugger\n", + cpu); + return; + } + + /* + * In general, architectures don't support dumping the stack of a + * "running" process that's not the current one. From the point of + * view of the Linux kernel, processes that are looping in the kgdb + * slave loop are still "running". There's also no API (that actually + * works across all architectures) that can do a stack crawl based + * on registers passed as a parameter. + * + * Solve this conundrum by asking slave CPUs to do the backtrace + * themselves. + */ + kgdb_info[cpu].exception_state |= DCPU_WANT_BT; + while (kgdb_info[cpu].exception_state & DCPU_WANT_BT) + cpu_relax(); +} +#endif + /* * Return true if there is a valid kgdb I/O module.
Also if no * debugger is attached a message can be printed to the console about @@ -580,6 +611,9 @@ cpu_loop: atomic_xchg(&kgdb_active, cpu); break; } + } else if (kgdb_info[cpu].exception_state & DCPU_WANT_BT) { + dump_stack(); + kgdb_info[cpu].exception_state &= ~DCPU_WANT_BT; } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { if (!raw_spin_is_locked(&dbg_slave_lock)) goto return_normal; diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 804b0fe5a0ba..cd22b5f68831 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -33,6 +33,7 @@ struct kgdb_state { #define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ #define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ #define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ +#define DCPU_WANT_BT 0x8 /* Slave cpu should backtrace then clear flag */ struct debuggerinfo_struct { void *debuggerinfo; @@ -75,6 +76,7 @@ extern int kdb_stub(struct kgdb_state *ks); extern int kdb_parse(const char *cmdstr); extern int kdb_common_init_state(struct kgdb_state *ks); extern int kdb_common_deinit_state(void); +extern void kdb_dump_stack_on_cpu(int cpu); #else /* ! CONFIG_KGDB_KDB */ static inline int kdb_stub(struct kgdb_state *ks) { diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index d9af139f9a31..0e94efe07b72 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -22,20 +22,15 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { int old_lvl = console_loglevel; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; - kdb_set_current_task(p); - if (addr) { - show_stack((struct task_struct *)p, addr); - } else if (kdb_current_regs) { -#ifdef CONFIG_X86 - show_stack(p, &kdb_current_regs->sp); -#else - show_stack(p, NULL); -#endif - } else { - show_stack(p, NULL); - } + + if (!addr && kdb_task_has_cpu(p)) + kdb_dump_stack_on_cpu(kdb_process_cpu(p)); + else + show_stack(p, addr); + console_loglevel = old_lvl; kdb_trap_printk--; } -- cgit v1.2.3 From fb3c5386b382d4097476ce9647260fc89b34afdb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 20 Sep 2019 10:30:05 +0200 Subject: seccomp: add SECCOMP_USER_NOTIF_FLAG_CONTINUE This allows the seccomp notifier to continue a syscall. A positive discussion about this feature was triggered by a post to the ksummit-discuss mailing list (cf. [3]) and took place during KSummit (cf. [1]) and again at the containers/checkpoint-restore micro-conference at Linux Plumbers. Recently we landed seccomp support for SECCOMP_RET_USER_NOTIF (cf. [4]) which enables a process (watchee) to retrieve an fd for its seccomp filter. This fd can then be handed to another (usually more privileged) process (watcher). The watcher will then be able to receive seccomp messages about the syscalls having been performed by the watchee. This feature is heavily used in some userspace workloads. For example, it is currently used to intercept mknod() syscalls in user namespaces aka in containers. The mknod() syscall can be easily filtered based on dev_t. This allows us to only intercept a very specific subset of mknod() syscalls. Furthermore, mknod() is not possible in user namespaces toto caelo and so intercepting and denying syscalls that are not in the whitelist by accident is not a big deal. The watchee won't notice a difference. In contrast to mknod(), a lot of other syscalls we intercept (e.g. setxattr()) cannot be easily filtered like mknod() because they have pointer arguments.
Additionally, some of them might actually succeed in user namespaces (e.g. setxattr() for all "user.*" xattrs). Since we currently cannot tell seccomp to continue from a user notifier we are stuck with performing all of the syscalls in lieu of the container. This is a huge security liability since it is extremely difficult to correctly assume all of the necessary privileges of the calling task such that the syscall can be successfully emulated without escaping other additional security restrictions (think missing CAP_MKNOD for mknod(), or MS_NODEV on a filesystem etc.). This can be solved by telling seccomp to resume the syscall. One thing that came up in the discussion was the problem that another thread could change the memory after userspace has decided to let the syscall continue, which is a well-known TOCTOU with seccomp which is present in other ways already. The discussion showed that this feature is already very useful for any syscall without pointer arguments. For any accidentally intercepted non-pointer syscall it is safe to continue. For syscalls with pointer arguments there is a race, but for any cautious userspace and the main use cases the race doesn't matter. The notifier is intended to be used in a scenario where a more privileged watcher supervises the syscalls of a less privileged watchee to allow it to get around kernel-enforced limitations by performing the syscall for it whenever deemed safe by the watcher. Hence, if a user tricks the watcher into allowing a syscall they will either get a deny based on kernel-enforced restrictions later or they will have changed the arguments in such a way that they manage to perform a syscall with arguments that they would've been allowed to do anyway. In general, it is good to point out again that the notifier fd was not intended to allow userspace to implement a security policy but rather to work around kernel security mechanisms in cases where the watcher knows that a given action is safe to perform.
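To make the new flag concrete, here is a minimal sketch of a watcher replying "continue" to one notification. The helper name is hypothetical; the ioctls and structures are the existing seccomp user-notification UAPI from <linux/seccomp.h>, and the flag itself is added to the UAPI header by this series (outside the 'kernel'-limited view shown here):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/seccomp.h>

	/* notify_fd is the filter fd handed over by the watchee */
	static int handle_one_notification(int notify_fd)
	{
		struct seccomp_notif req;
		struct seccomp_notif_resp resp;

		memset(&req, 0, sizeof(req));
		if (ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_RECV, &req) < 0)
			return -1;

		memset(&resp, 0, sizeof(resp));
		resp.id = req.id;
		/* Resume the syscall in the watchee. Note that error and
		 * val must stay 0 when the flag is set, matching the check
		 * added to seccomp_notify_send() in the diff below. */
		resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;

		return ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_SEND, &resp);
	}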
/* References */ [1]: https://linuxplumbersconf.org/event/4/contributions/560 [2]: https://linuxplumbersconf.org/event/4/contributions/477 [3]: https://lore.kernel.org/r/20190719093538.dhyopljyr5ns33qx@brauner.io [4]: commit 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Co-developed-by: Kees Cook Signed-off-by: Christian Brauner Reviewed-by: Tycho Andersen Cc: Andy Lutomirski Cc: Will Drewry CC: Tyler Hicks Link: https://lore.kernel.org/r/20190920083007.11475-2-christian.brauner@ubuntu.com Signed-off-by: Kees Cook --- kernel/seccomp.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index dba52a7db5e8..12d2227e5786 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -75,6 +75,7 @@ struct seccomp_knotif { /* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */ int error; long val; + u32 flags; /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */ struct completion ready; @@ -732,11 +733,12 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter) return filter->notif->next_id++; } -static void seccomp_do_user_notification(int this_syscall, - struct seccomp_filter *match, - const struct seccomp_data *sd) +static int seccomp_do_user_notification(int this_syscall, + struct seccomp_filter *match, + const struct seccomp_data *sd) { int err; + u32 flags = 0; long ret = 0; struct seccomp_knotif n = {}; @@ -764,6 +766,7 @@ static void seccomp_do_user_notification(int this_syscall, if (err == 0) { ret = n.val; err = n.error; + flags = n.flags; } /* @@ -780,8 +783,14 @@ static void seccomp_do_user_notification(int this_syscall, list_del(&n.list); out: mutex_unlock(&match->notify_lock); + + /* Userspace requests to continue the syscall. */ + if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) + return 0; + syscall_set_return_value(current, task_pt_regs(current), err, ret); + return -1; } static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, @@ -867,8 +876,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_USER_NOTIF: - seccomp_do_user_notification(this_syscall, match, sd); - goto skip; + if (seccomp_do_user_notification(this_syscall, match, sd)) + goto skip; + + return 0; case SECCOMP_RET_LOG: seccomp_log(this_syscall, 0, action, true); @@ -1087,7 +1098,11 @@ static long seccomp_notify_send(struct seccomp_filter *filter, if (copy_from_user(&resp, buf, sizeof(resp))) return -EFAULT; - if (resp.flags) + if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE) + return -EINVAL; + + if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) && + (resp.error || resp.val)) return -EINVAL; ret = mutex_lock_interruptible(&filter->notify_lock); @@ -1116,6 +1131,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter, knotif->state = SECCOMP_NOTIFY_REPLIED; knotif->error = resp.error; knotif->val = resp.val; + knotif->flags = resp.flags; complete(&knotif->ready); out: mutex_unlock(&filter->notify_lock); -- cgit v1.2.3 From da6043fe85eb5ec621e34a92540735dcebbea134 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 25 Sep 2019 15:39:12 +0100 Subject: PM / hibernate: memory_bm_find_bit(): Tighten node optimisation When looking for a bit by number we make use of the cached result from the preceding lookup to speed up operation. Firstly we check if the requested pfn is within the cached zone and, if not, look up the new zone.
We then check if the offset for that pfn falls within the existing cached node. This happens regardless of whether the node is within the zone we are now scanning. With certain memory layouts it is possible for this to trigger falsely, creating a temporary alias for the pfn to a different bit. This leads the hibernation code to free memory which it had never allocated, with the expected fallout. Ensure the zone we are scanning matches the cached zone before considering the cached node. Deep thanks go to Andrea for many, many, many hours of hacking and testing that went into cornering this bug. Reported-by: Andrea Righi Tested-by: Andrea Righi Signed-off-by: Andy Whitcroft Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 83105874f255..26b9168321e7 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -734,8 +734,15 @@ zone_found: * We have found the zone. Now walk the radix tree to find the leaf node * for our PFN. */ + + /* + * If the zone we wish to scan is the current zone and the + * pfn falls into the current node then we do not need to walk + * the tree. + */ node = bm->cur.node; - if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) + if (zone == bm->cur.zone && + ((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) goto node_found; node = zone->rtree; -- cgit v1.2.3 From ce4dd4429b3c7e4506870796f3b8b06d707d2928 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 16 Oct 2019 15:13:41 +0100 Subject: Remove the nr_exclusive argument from __wake_up_sync_key() Remove the nr_exclusive argument from __wake_up_sync_key() and derived functions as everything seems to set it to 1. Note also that if it wasn't set to 1, it would clear WF_SYNC anyway. Signed-off-by: David Howells Acked-by: Peter Zijlstra (Intel) --- kernel/exit.c | 2 +- kernel/sched/wait.c | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a46a50d67002..a1ff25ef050e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1435,7 +1435,7 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, - TASK_INTERRUPTIBLE, 1, p); + TASK_INTERRUPTIBLE, p); } static long do_wait(struct wait_opts *wo) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index c1e566a114ca..b4b52361dab7 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -169,7 +169,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @wq_head: the waitqueue * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: opaque value to be passed to wakeup targets * * The sync wakeup differs that the waker knows that it will schedule * @@ -183,26 +182,21 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); * accessing the task state.
*/ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, void *key) + void *key) { - int wake_flags = 1; /* XXX WF_SYNC */ - if (unlikely(!wq_head)) return; - if (unlikely(nr_exclusive != 1)) - wake_flags = 0; - - __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key); + __wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); /* * __wake_up_sync - see __wake_up_sync_key() */ -void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive) +void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode) { - __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL); + __wake_up_sync_key(wq_head, mode, NULL); } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ -- cgit v1.2.3 From d07ce4e32a8d68062c58a3e635619313c52d0bf7 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Mon, 21 Oct 2019 11:10:56 +0100 Subject: kdb: Avoid array subscript warnings on non-SMP builds Recent versions of gcc (reported on gcc-7.4) issue array subscript warnings for builds where SMP is not enabled. kernel/debug/debug_core.c: In function 'kdb_dump_stack_on_cpu': kernel/debug/debug_core.c:452:17: warning: array subscript is outside array +bounds [-Warray-bounds] if (!(kgdb_info[cpu].exception_state & DCPU_IS_SLAVE)) { ~~~~~~~~~^~~~~ kernel/debug/debug_core.c:469:33: warning: array subscript is outside array +bounds [-Warray-bounds] kgdb_info[cpu].exception_state |= DCPU_WANT_BT; kernel/debug/debug_core.c:470:18: warning: array subscript is outside array +bounds [-Warray-bounds] while (kgdb_info[cpu].exception_state & DCPU_WANT_BT) There is no bug here but there is scope to improve the code generation for non-SMP systems (whilst also silencing the warning). Reported-by: kbuild test robot Fixes: 2277b492582d ("kdb: Fix stack crawling on 'running' CPUs that aren't the master") Signed-off-by: Daniel Thompson Link: https://lore.kernel.org/r/20191021101057.23861-1-daniel.thompson@linaro.org Reviewed-by: Douglas Anderson --- kernel/debug/debug_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 70e86b4b4932..2b7c9b67931d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -444,7 +444,7 @@ setundefined: #ifdef CONFIG_KGDB_KDB void kdb_dump_stack_on_cpu(int cpu) { - if (cpu == raw_smp_processor_id()) { + if (cpu == raw_smp_processor_id() || !IS_ENABLED(CONFIG_SMP)) { dump_stack(); return; } -- cgit v1.2.3 From c34c78dfc1fc68a1f5403f996de8ca62f298d7b2 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Wed, 23 Oct 2019 21:27:34 +0800 Subject: audit: remove redundant condition check in kauditd_thread() A warning is found by the code analysis tool: "the condition 'if(ac && rc < 0)' is redundant: ac" The @ac variable has been checked before. It can't be a null pointer here, so remove the redundant condition check.
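A simplified sketch of the surrounding control flow in kauditd_thread(), showing why @ac is known to be non-NULL at the two sites being changed (details of the real function are elided):

	ac = rcu_dereference(auditd_conn);
	if (!ac) {
		sk = NULL;
		goto main_queue;	/* the only path where ac is NULL */
	}
	/* ... */
	rc = kauditd_send_queue(sk, portid, &audit_hold_queue, ...);
	if (rc < 0) {			/* the "ac &&" test added nothing here */
		sk = NULL;
		auditd_reset(ac);
		goto main_queue;
	}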
Signed-off-by: Yunfeng Ye Signed-off-by: Paul Moore --- kernel/audit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d75485aa25ff..8e09f0f55b4b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -830,7 +830,7 @@ static int kauditd_thread(void *dummy) rc = kauditd_send_queue(sk, portid, &audit_hold_queue, UNICAST_RETRIES, NULL, kauditd_rehold_skb); - if (ac && rc < 0) { + if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; @@ -840,7 +840,7 @@ static int kauditd_thread(void *dummy) rc = kauditd_send_queue(sk, portid, &audit_retry_queue, UNICAST_RETRIES, NULL, kauditd_hold_skb); - if (ac && rc < 0) { + if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; -- cgit v1.2.3 From 53b63136e81220cb2f8b541c03a1df9199896821 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:24 +0100 Subject: kdb: Tidy up code to handle escape sequences kdb_read_get_key() has extremely complex break/continue control flow managed by state variables and is very hard to review or modify. In particular the way the escape sequence handling interacts with the general control flow is hard to follow. Separate out the escape key handling, without changing the control flow. This makes the main body of the code easier to review. Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-2-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 128 ++++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 3a5184eb6977..cfc054fd8097 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -49,6 +49,65 @@ static int kgdb_transition_check(char *buffer) return 0; } +/** + * kdb_handle_escape() - validity check on an accumulated escape sequence. + * @buf: Accumulated escape characters to be examined. Note that buf + * is not a string, it is an array of characters and need not be + * nil terminated. + * @sz: Number of accumulated escape characters. + * + * Return: -1 if the escape sequence is unwanted, 0 if it is incomplete, + * otherwise it returns a mapped key value to pass to the upper layers. 
+ */ +static int kdb_handle_escape(char *buf, size_t sz) +{ + char *lastkey = buf + sz - 1; + + switch (sz) { + case 1: + if (*lastkey == '\e') + return 0; + break; + + case 2: /* \e<something> */ + if (*lastkey == '[') + return 0; + break; + + case 3: + switch (*lastkey) { + case 'A': /* \e[A, up arrow */ + return 16; + case 'B': /* \e[B, down arrow */ + return 14; + case 'C': /* \e[C, right arrow */ + return 6; + case 'D': /* \e[D, left arrow */ + return 2; + case '1': /* \e[<1,3,4>], may be home, del, end */ + case '3': + case '4': + return 0; + } + break; + + case 4: + if (*lastkey == '~') { + switch (buf[2]) { + case '1': /* \e[1~, home */ + return 1; + case '3': /* \e[3~, del */ + return 4; + case '4': /* \e[4~, end */ + return 5; + } + } + break; + } + + return -1; +} + static int kdb_read_get_key(char *buffer, size_t bufsize) { #define ESCAPE_UDELAY 1000 @@ -102,68 +161,15 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) escape_delay = 2; continue; } - if (ped - escape_data == 1) { - /* \e */ - continue; - } else if (ped - escape_data == 2) { - /* \e<something> */ - if (key != '[') - escape_delay = 2; - continue; - } else if (ped - escape_data == 3) { - /* \e[<something> */ - int mapkey = 0; switch (key) { - case 'A': /* \e[A, up arrow */ - mapkey = 16; - break; - case 'B': /* \e[B, down arrow */ - mapkey = 14; - break; - case 'C': /* \e[C, right arrow */ - mapkey = 6; - break; - case 'D': /* \e[D, left arrow */ - mapkey = 2; - break; - case '1': /* dropthrough */ - case '3': /* dropthrough */ - /* \e[<1,3,4>], may be home, del, end */ - case '4': - mapkey = -1; - break; - } - if (mapkey != -1) { - if (mapkey > 0) { - escape_data[0] = mapkey; - escape_data[1] = '\0'; - } - escape_delay = 2; - } - continue; - } else if (ped - escape_data == 4) { - /* \e[<1,3,4> */ - int mapkey = 0; if (key == '~') { switch (escape_data[2]) { - case '1': /* \e[1~, home */ - mapkey = 1; - break; - case '3': /* \e[3~, del */ - mapkey = 4; - break; - case '4': /* \e[4~, end */ - mapkey = 5; - break; - } - } - if (mapkey > 0) { - escape_data[0] = mapkey; - escape_data[1] = '\0'; - } - escape_delay = 2; - continue; + + key = kdb_handle_escape(escape_data, ped - escape_data); + if (key > 0) { + escape_data[0] = key; + escape_data[1] = '\0'; } + if (key) + escape_delay = 2; + continue; } break; /* A key to process */ } -- cgit v1.2.3 From d04213af90935d8b247c1327c9ea142fc037165f Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:25 +0100 Subject: kdb: Simplify code to fetch characters from console Currently kdb_read_get_key() contains complex control flow that, on close inspection, turns out to be unnecessary. In particular: 1. It is impossible to enter the branch conditioned on (escape_delay == 1) except when the loop enters with (escape_delay == 2), allowing us to combine the branches. 2. Most of the code conditioned on (escape_delay == 2) simply modifies local data and then breaks out of the loop causing the function to return escape_data[0]. 3. Based on #2 there is not actually any need to ever explicitly set escape_delay to 2 because it is much simpler to directly return escape_data[0] instead. 4. escape_data[0] is, for all but one exit path, known to be '\e'. Simplify the code based on these observations.
There is a subtle (and harmless) change of behaviour resulting from this simplification: instead of letting the escape time out after ~1998 milliseconds we now time out after ~2000 milliseconds. Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-3-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index cfc054fd8097..a92ceca29637 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -124,25 +124,18 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) touch_nmi_watchdog(); f = &kdb_poll_funcs[0]; } - if (escape_delay == 2) { - *ped = '\0'; - ped = escape_data; - --escape_delay; - } - if (escape_delay == 1) { - key = *ped++; - if (!*ped) - --escape_delay; - break; - } + key = (*f)(); + if (key == -1) { if (escape_delay) { udelay(ESCAPE_UDELAY); - --escape_delay; + if (--escape_delay == 0) + return '\e'; } continue; } + if (bufsize <= 2) { if (key == '\r') key = '\n'; @@ -150,27 +143,24 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) *buffer = '\0'; return -1; } + if (escape_delay == 0 && key == '\e') { escape_delay = ESCAPE_DELAY; ped = escape_data; f_escape = f; } if (escape_delay) { - *ped++ = key; - if (f_escape != f) { - escape_delay = 2; - continue; - } + if (f_escape != f) + return '\e'; + *ped++ = key; key = kdb_handle_escape(escape_data, ped - escape_data); - if (key > 0) { - escape_data[0] = key; - escape_data[1] = '\0'; - } - if (key) - escape_delay = 2; - continue; + if (key < 0) + return '\e'; + if (key == 0) + continue; } + break; /* A key to process */ } return key; -- cgit v1.2.3 From 4f27e824bf83dfc2f6dc1a54fae419be7cd335af Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:26 +0100 Subject: kdb: Remove special case logic from kdb_read() kdb_read() contains special case logic to force it to exit after reading a single character. We can remove all the special case logic by directly calling the function to read a single character instead. This also allows us to tidy up the function prototype which, because it now matches getchar(), we can also rename in order to make its role clearer. This does involve some extra code to handle btaprompt properly but we don't mind the new lines of code here because the old code had some interesting problems (bad newline handling, odd treatment of unexpected characters).
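The net effect on callers, condensed from the diff below (a before/after sketch):

	/* before: a bufsize of 2 was the idiom for "read one key" */
	char buf1[16] = "";
	kdb_read(buf1, 2);

	/* after: ask for exactly one character */
	char ch = kdb_getchar();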
Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-4-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_bt.c | 22 +++++++++------- kernel/debug/kdb/kdb_io.c | 61 +++++++++++++++++++----------------------- kernel/debug/kdb/kdb_private.h | 1 + 3 files changed, 42 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 0e94efe07b72..4af48ac53625 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -75,9 +75,10 @@ static void kdb_show_stack(struct task_struct *p, void *addr) static int kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) { - char buffer[2]; - if (kdb_getarea(buffer[0], (unsigned long)p) || - kdb_getarea(buffer[0], (unsigned long)(p+1)-1)) + char ch; + + if (kdb_getarea(ch, (unsigned long)p) || + kdb_getarea(ch, (unsigned long)(p+1)-1)) return KDB_BADADDR; if (!kdb_task_state(p, mask)) return 0; @@ -85,12 +86,17 @@ kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) kdb_ps1(p); kdb_show_stack(p, NULL); if (btaprompt) { - kdb_getstr(buffer, sizeof(buffer), - "Enter <q> to end, <cr> to continue:"); - if (buffer[0] == 'q') { - kdb_printf("\n"); + kdb_printf("Enter <q> to end, <cr> or <space> to continue:"); + do { + ch = kdb_getchar(); + } while (!strchr("\r\n q", ch)); + kdb_printf("\n"); + + /* reset the pager */ + kdb_nextline = 1; + + if (ch == 'q') return 1; - } } touch_nmi_watchdog(); return 0; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index a92ceca29637..9b6933d585b5 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -108,7 +108,22 @@ static int kdb_handle_escape(char *buf, size_t sz) return -1; } -static int kdb_read_get_key(char *buffer, size_t bufsize) +/** + * kdb_getchar() - Read a single character from a kdb console (or consoles). + * + * Other than polling the various consoles that are currently enabled, + * most of the work done in this function is dealing with escape sequences. + * + * An escape key could be the start of a vt100 control sequence such as \e[D + * (left arrow) or it could be a character in its own right. The standard + * method for detecting the difference is to wait for 2 seconds to see if there + * are any other characters. kdb is complicated by the lack of a timer service + * (interrupts are off), by multiple input sources. Escape sequence processing + * has to be done as states in the polling loop. + * + * Return: The key pressed or a control code derived from an escape sequence. + */ +char kdb_getchar(void) { #define ESCAPE_UDELAY 1000 #define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */ @@ -126,7 +141,6 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) } key = (*f)(); - if (key == -1) { if (escape_delay) { udelay(ESCAPE_UDELAY); @@ -136,14 +150,6 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) continue; } - if (bufsize <= 2) { - if (key == '\r') - key = '\n'; - *buffer++ = key; - *buffer = '\0'; - return -1; - } - if (escape_delay == 0 && key == '\e') { escape_delay = ESCAPE_DELAY; ped = escape_data; @@ -184,17 +190,7 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) * function. It is not reentrant - it relies on the fact * that while kdb is running on only one "master debug" cpu. * Remarks: - * - * The buffer size must be >= 2. A buffer size of 2 means that the caller only - * wants a single key.
- * - * An escape key could be the start of a vt100 control sequence such as \e[D - * (left arrow) or it could be a character in its own right. The standard - * method for detecting the difference is to wait for 2 seconds to see if there - * are any other characters. kdb is complicated by the lack of a timer service - * (interrupts are off), by multiple input sources and by the need to sometimes - * return after just one key. Escape sequence processing has to be done as - * states in the polling loop. + * The buffer size must be >= 2. */ static char *kdb_read(char *buffer, size_t bufsize) @@ -229,9 +225,7 @@ static char *kdb_read(char *buffer, size_t bufsize) *cp = '\0'; kdb_printf("%s", buffer); poll_again: - key = kdb_read_get_key(buffer, bufsize); - if (key == -1) - return buffer; + key = kdb_getchar(); if (key != 9) tab = 0; switch (key) { @@ -742,7 +736,7 @@ kdb_printit: /* check for having reached the LINES number of printed lines */ if (kdb_nextline >= linecount) { - char buf1[16] = ""; + char ch; /* Watch out for recursion here. Any routine that calls * kdb_printf will come back through here. And kdb_read @@ -777,39 +771,38 @@ kdb_printit: if (logging) printk("%s", moreprompt); - kdb_read(buf1, 2); /* '2' indicates to return - * immediately after getting one key. */ + ch = kdb_getchar(); kdb_nextline = 1; /* Really set output line 1 */ /* empty and reset the buffer: */ kdb_buffer[0] = '\0'; next_avail = kdb_buffer; size_avail = sizeof(kdb_buffer); - if ((buf1[0] == 'q') || (buf1[0] == 'Q')) { + if ((ch == 'q') || (ch == 'Q')) { /* user hit q or Q */ KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */ KDB_STATE_CLEAR(PAGER); /* end of command output; back to normal mode */ kdb_grepping_flag = 0; kdb_printf("\n"); - } else if (buf1[0] == ' ') { + } else if (ch == ' ') { kdb_printf("\r"); suspend_grep = 1; /* for this recursion */ - } else if (buf1[0] == '\n') { + } else if (ch == '\n' || ch == '\r') { kdb_nextline = linecount - 1; kdb_printf("\r"); suspend_grep = 1; /* for this recursion */ - } else if (buf1[0] == '/' && !kdb_grepping_flag) { + } else if (ch == '/' && !kdb_grepping_flag) { kdb_printf("\r"); kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN, kdbgetenv("SEARCHPROMPT") ?: "search> "); *strchrnul(kdb_grep_string, '\n') = '\0'; kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH; suspend_grep = 1; /* for this recursion */ - } else if (buf1[0] && buf1[0] != '\n') { - /* user hit something other than enter */ + } else if (ch) { + /* user hit something unexpected */ suspend_grep = 1; /* for this recursion */ - if (buf1[0] != '/') + if (ch != '/') kdb_printf( "\nOnly 'q', 'Q' or '/' are processed at " "more prompt, input ignored\n"); diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 2118d8258b7c..55d052061ef9 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -210,6 +210,7 @@ extern void kdb_ps1(const struct task_struct *p); extern void kdb_print_nameval(const char *name, unsigned long val); extern void kdb_send_sig(struct task_struct *p, int sig); extern void kdb_meminfo_proc_show(void); +extern char kdb_getchar(void); extern char *kdb_getstr(char *, size_t, const char *); extern void kdb_gdb_state_pass(char *buf); -- cgit v1.2.3 From cdca8d8900dd33ce6b8b526e247d2a6009d05de0 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:27 +0100 Subject: kdb: Improve handling of characters from different input sources Currently if an escape timer is interrupted by a character from a different input source 
then the new character is discarded and the function returns '\e' (which will be discarded by the level above). It is hard to see why this would ever be the desired behaviour. Fix this to return the new character rather than the '\e'. This is a bigger refactor than might be expected because the new character needs to go through escape sequence detection. Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-5-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 9b6933d585b5..f794c0ca4557 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -127,10 +127,10 @@ char kdb_getchar(void) { #define ESCAPE_UDELAY 1000 #define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */ - char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */ - char *ped = escape_data; + char buf[4]; /* longest vt100 escape sequence is 4 bytes */ + char *pbuf = buf; int escape_delay = 0; - get_char_func *f, *f_escape = NULL; + get_char_func *f, *f_prev = NULL; int key; for (f = &kdb_poll_funcs[0]; ; ++f) { @@ -150,26 +150,26 @@ char kdb_getchar(void) continue; } - if (escape_delay == 0 && key == '\e') { + /* + * When the first character is received (or we get a change + * input source) we set ourselves up to handle an escape + * sequences (just in case). + */ + if (f_prev != f) { + f_prev = f; + pbuf = buf; escape_delay = ESCAPE_DELAY; - ped = escape_data; - f_escape = f; - } - if (escape_delay) { - if (f_escape != f) - return '\e'; - - *ped++ = key; - key = kdb_handle_escape(escape_data, ped - escape_data); - if (key < 0) - return '\e'; - if (key == 0) - continue; } - break; /* A key to process */ + *pbuf++ = key; + key = kdb_handle_escape(buf, pbuf - buf); + if (key < 0) /* no escape sequence; return first character */ + return buf[0]; + if (key > 0) + return key; } - return key; + + unreachable(); } /* -- cgit v1.2.3 From c58ff643763c78bef12874ee39995c9f7f987bc2 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:28 +0100 Subject: kdb: Tweak escape handling for vi users Currently if sequences such as "\ehelp\r" are delivered to the console then the h gets eaten by the escape handling code. Since pressing escape becomes something of a nervous twitch for vi users (and that escape doesn't have much effect at a shell prompt) it is more helpful to emit the 'h' than the '\e'. We don't simply choose to emit the final character for all escape sequences since that will do odd things for unsupported escape sequences (in other words we retain the existing behaviour once we see '\e['). Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-6-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index f794c0ca4557..8bcdded5d61f 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -163,8 +163,8 @@ char kdb_getchar(void) *pbuf++ = key; key = kdb_handle_escape(buf, pbuf - buf); - if (key < 0) /* no escape sequence; return first character */ - return buf[0]; + if (key < 0) /* no escape sequence; return best character */ + return buf[pbuf - buf == 2 ? 
1 : 0]; if (key > 0) return key; } -- cgit v1.2.3 From a445e940ea686fc60475564009821010eb213be3 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Wed, 30 Oct 2019 10:13:13 +0000 Subject: dma-mapping: fix handling of dma-ranges for reserved memory (again) Daniele reported that the issue previously fixed in c41f9ea998f3 ("drivers: dma-coherent: Account dma_pfn_offset when used with device tree") reappeared shortly after 43fc509c3efb ("dma-coherent: introduce interface for default DMA pool") where the fix was accidentally dropped. Let's put the fix back in place and respect dma-ranges for reserved memory. Fixes: 43fc509c3efb ("dma-coherent: introduce interface for default DMA pool") Reported-by: Daniele Alessandrelli Tested-by: Daniele Alessandrelli Tested-by: Alexandre Torgue Signed-off-by: Vladimir Murzin Signed-off-by: Christoph Hellwig --- kernel/dma/coherent.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 545e3869b0e3..551b0eb7028a 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -123,8 +123,9 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, return ret; } -static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, - ssize_t size, dma_addr_t *dma_handle) +static void *__dma_alloc_from_coherent(struct device *dev, + struct dma_coherent_mem *mem, + ssize_t size, dma_addr_t *dma_handle) { int order = get_order(size); unsigned long flags; @@ -143,7 +144,7 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, /* * Memory was found in the coherent area. */ - *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); + *dma_handle = dma_get_device_base(dev, mem) + (pageno << PAGE_SHIFT); ret = mem->virt_base + (pageno << PAGE_SHIFT); spin_unlock_irqrestore(&mem->spinlock, flags); memset(ret, 0, size); @@ -175,17 +176,18 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, if (!mem) return 0; - *ret = __dma_alloc_from_coherent(mem, size, dma_handle); + *ret = __dma_alloc_from_coherent(dev, mem, size, dma_handle); return 1; } -void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) +void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle) { if (!dma_coherent_default_memory) return NULL; - return __dma_alloc_from_coherent(dma_coherent_default_memory, size, - dma_handle); + return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size, + dma_handle); } static int __dma_release_from_coherent(struct dma_coherent_mem *mem, -- cgit v1.2.3 From 9ff6aa027dbb98755f0265695354f2dd07c0d1ce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 28 Oct 2019 14:56:46 -0700 Subject: dma-debug: add a schedule point in debug_dma_dump_mappings() debug_dma_dump_mappings() can take a lot of CPU cycles: lpk43:/# time wc -l /sys/kernel/debug/dma-api/dump 163435 /sys/kernel/debug/dma-api/dump real 0m0.463s user 0m0.003s sys 0m0.459s Let's add a cond_resched() to avoid holding the CPU for too long.
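The shape of the fixed loop, to show why the schedule point is safe here (a sketch; the real function walks the dma_entry_hash buckets shown in the diff below):

	for (idx = 0; idx < HASH_SIZE; idx++) {
		spin_lock_irqsave(&bucket->lock, flags);
		list_for_each_entry(entry, &bucket->list, list) {
			/* print one mapping ... */
		}
		spin_unlock_irqrestore(&bucket->lock, flags);
		cond_resched();	/* the lock is dropped, so scheduling is allowed */
	}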
Signed-off-by: Eric Dumazet Cc: Corentin Labbe Cc: Christoph Hellwig Cc: Marek Szyprowski Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 099002d84f46..4ad74f5987ea 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -420,6 +420,7 @@ void debug_dma_dump_mappings(struct device *dev) } spin_unlock_irqrestore(&bucket->lock, flags); + cond_resched(); } } -- cgit v1.2.3 From ca66536845cd55c6a5fccd82694dcc87ed970780 Mon Sep 17 00:00:00 2001 From: Shyam Saini Date: Sun, 20 Oct 2019 10:33:22 +0530 Subject: kernel: dma-contiguous: mark CMA parameters __initdata/__initconst These parameters are only referenced by __init routine calls during early boot so they should be marked as __initdata and __initconst accordingly. Signed-off-by: Shyam Saini Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 69cfb4345388..daa4e6eefdde 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -42,10 +42,11 @@ struct cma *dma_contiguous_default_area; * Users, who want to set the size of global CMA area for their system * should use cma= kernel parameter. */ -static const phys_addr_t size_bytes = (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; -static phys_addr_t size_cmdline = -1; -static phys_addr_t base_cmdline; -static phys_addr_t limit_cmdline; +static const phys_addr_t size_bytes __initconst = + (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; +static phys_addr_t size_cmdline __initdata = -1; +static phys_addr_t base_cmdline __initdata; +static phys_addr_t limit_cmdline __initdata; static int __init early_cma(char *p) { -- cgit v1.2.3 From f94df9890e98f2090c6a8d70c795134863b70201 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Sep 2019 16:07:45 +0100 Subject: Add wake_up_interruptible_sync_poll_locked() Add a wakeup call for a case whereby the caller already has the waitqueue spinlock held. This can be used by pipes to alter the ring buffer indices and issue a wakeup under the same spinlock. Signed-off-by: David Howells Acked-by: Peter Zijlstra (Intel) --- kernel/sched/wait.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index b4b52361dab7..ba059fbfc53a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -191,6 +191,29 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, } EXPORT_SYMBOL_GPL(__wake_up_sync_key); +/** + * __wake_up_locked_sync_key - wake up a thread blocked on a locked waitqueue. + * @wq_head: the waitqueue + * @mode: which threads + * @key: opaque value to be passed to wakeup targets + * + * The sync wakeup differs in that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + * + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. 
+ */ +void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, + unsigned int mode, void *key) +{ + __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key); + /* * __wake_up_sync - see __wake_up_sync_key() */ -- cgit v1.2.3 From f973cce0e4022fa8c969ca5b0c71559125382eb2 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 21 Oct 2019 22:38:29 +0200 Subject: kexec: Fix pointer-to-int-cast warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix two pointer-to-int-cast warnings when compiling for the 32-bit parisc platform: kernel/kexec_file.c: In function ‘crash_prepare_elf64_headers’: kernel/kexec_file.c:1307:19: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] phdr->p_vaddr = (Elf64_Addr)_text; ^ kernel/kexec_file.c:1324:19: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] phdr->p_vaddr = (unsigned long long) __va(mstart); ^ Signed-off-by: Helge Deller --- kernel/kexec_file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 79f252af7dee..a2df93948665 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -1304,7 +1304,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, if (kernel_map) { phdr->p_type = PT_LOAD; phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_vaddr = (Elf64_Addr)_text; + phdr->p_vaddr = (unsigned long) _text; phdr->p_filesz = phdr->p_memsz = _end - _text; phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); ehdr->e_phnum++; @@ -1321,7 +1321,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, phdr->p_offset = mstart; phdr->p_paddr = mstart; - phdr->p_vaddr = (unsigned long long) __va(mstart); + phdr->p_vaddr = (unsigned long) __va(mstart); phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; phdr->p_align = 0; ehdr->e_phnum++; -- cgit v1.2.3 From 7162431dcf72032835d369c8d7b51311df407938 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Wed, 16 Oct 2019 13:33:13 +0200 Subject: ftrace: Introduce PERMANENT ftrace_ops flag Livepatch uses ftrace for redirection to new patched functions. It means that if ftrace is disabled, all live patched functions are disabled as well. Toggling global 'ftrace_enabled' sysctl thus affects it directly. It is not a problem per se, because only an administrator can set sysctl values, but it still may be surprising. Introduce PERMANENT ftrace_ops flag to amend this. If FTRACE_OPS_FL_PERMANENT is set on any ftrace ops, the tracing cannot be disabled by disabling ftrace_enabled. Equally, a callback with the flag set cannot be registered if ftrace_enabled is disabled.
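A minimal sketch of a (hypothetical) callback opting in to the new flag; the callback signature matches ftrace_func_t as of this kernel:

	static void my_callback(unsigned long ip, unsigned long parent_ip,
				struct ftrace_ops *op, struct pt_regs *regs)
	{
		/* tracing or redirection work here */
	}

	static struct ftrace_ops my_ops = {
		.func	= my_callback,
		.flags	= FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_PERMANENT,
	};

	static int __init my_init(void)
	{
		/* Fails with -EBUSY if ftrace_enabled is already off. */
		return register_ftrace_function(&my_ops);
	}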
Link: http://lkml.kernel.org/r/20191016113316.13415-2-mbenes@suse.cz Reviewed-by: Petr Mladek Reviewed-by: Kamalesh Babulal Signed-off-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- kernel/livepatch/patch.c | 3 ++- kernel/trace/ftrace.c | 23 +++++++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index bd43537702bd..b552cf2d85f8 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -196,7 +196,8 @@ static int klp_patch_func(struct klp_func *func) ops->fops.func = klp_ftrace_handler; ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_DYNAMIC | - FTRACE_OPS_FL_IPMODIFY; + FTRACE_OPS_FL_IPMODIFY | + FTRACE_OPS_FL_PERMANENT; list_add(&ops->node, &klp_ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f296d89be757..89e9128652ef 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -326,6 +326,8 @@ int __register_ftrace_function(struct ftrace_ops *ops) if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) ops->flags |= FTRACE_OPS_FL_SAVE_REGS; #endif + if (!ftrace_enabled && (ops->flags & FTRACE_OPS_FL_PERMANENT)) + return -EBUSY; if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; @@ -6754,6 +6756,18 @@ int unregister_ftrace_function(struct ftrace_ops *ops) } EXPORT_SYMBOL_GPL(unregister_ftrace_function); +static bool is_permanent_ops_registered(void) +{ + struct ftrace_ops *op; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->flags & FTRACE_OPS_FL_PERMANENT) + return true; + } while_for_each_ftrace_op(op); + + return false; +} + int ftrace_enable_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -6771,8 +6785,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; - last_ftrace_enabled = !!ftrace_enabled; - if (ftrace_enabled) { /* we are starting ftrace again */ @@ -6783,12 +6795,19 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ftrace_startup_sysctl(); } else { + if (is_permanent_ops_registered()) { + ftrace_enabled = true; + ret = -EBUSY; + goto out; + } + /* stopping ftrace calls (just send to ftrace_stub) */ ftrace_trace_function = ftrace_stub; ftrace_shutdown_sysctl(); } + last_ftrace_enabled = !!ftrace_enabled; out: mutex_unlock(&ftrace_lock); return ret; -- cgit v1.2.3 From 56144737e67329c9aaed15f942d46a6302e2e3d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Nov 2019 09:48:04 -0800 Subject: hrtimer: Annotate lockless access to timer->state syzbot reported various data races caused by hrtimer_is_queued() reading timer->state. A READ_ONCE() is required there to silence the warning. Also add the corresponding WRITE_ONCE() when timer->state is set. In remove_hrtimer() the hrtimer_is_queued() helper is open coded to avoid loading timer->state twice.
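The reader side of the annotation lives in include/linux/hrtimer.h and is outside this 'kernel'-limited view; it presumably pairs with the WRITE_ONCE() updates in the diff below along these lines (a sketch):

	static inline bool hrtimer_is_queued(struct hrtimer *timer)
	{
		/* Pairs with the WRITE_ONCE()s in enqueue_hrtimer()
		 * and __remove_hrtimer(). */
		return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED);
	}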
KCSAN reported these cases: BUG: KCSAN: data-race in __remove_hrtimer / tcp_pacing_check write to 0xffff8880b2a7d388 of 1 bytes by interrupt on cpu 0: __remove_hrtimer+0x52/0x130 kernel/time/hrtimer.c:991 __run_hrtimer kernel/time/hrtimer.c:1496 [inline] __hrtimer_run_queues+0x250/0x600 kernel/time/hrtimer.c:1576 hrtimer_run_softirq+0x10e/0x150 kernel/time/hrtimer.c:1593 __do_softirq+0x115/0x33f kernel/softirq.c:292 run_ksoftirqd+0x46/0x60 kernel/softirq.c:603 smpboot_thread_fn+0x37d/0x4a0 kernel/smpboot.c:165 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 read to 0xffff8880b2a7d388 of 1 bytes by task 24652 on cpu 1: tcp_pacing_check net/ipv4/tcp_output.c:2235 [inline] tcp_pacing_check+0xba/0x130 net/ipv4/tcp_output.c:2225 tcp_xmit_retransmit_queue+0x32c/0x5a0 net/ipv4/tcp_output.c:3044 tcp_xmit_recovery+0x7c/0x120 net/ipv4/tcp_input.c:3558 tcp_ack+0x17b6/0x3170 net/ipv4/tcp_input.c:3717 tcp_rcv_established+0x37e/0xf50 net/ipv4/tcp_input.c:5696 tcp_v4_do_rcv+0x381/0x4e0 net/ipv4/tcp_ipv4.c:1561 sk_backlog_rcv include/net/sock.h:945 [inline] __release_sock+0x135/0x1e0 net/core/sock.c:2435 release_sock+0x61/0x160 net/core/sock.c:2951 sk_stream_wait_memory+0x3d7/0x7c0 net/core/stream.c:145 tcp_sendmsg_locked+0xb47/0x1f30 net/ipv4/tcp.c:1393 tcp_sendmsg+0x39/0x60 net/ipv4/tcp.c:1434 inet_sendmsg+0x6d/0x90 net/ipv4/af_inet.c:807 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 BUG: KCSAN: data-race in __remove_hrtimer / __tcp_ack_snd_check write to 0xffff8880a3a65588 of 1 bytes by interrupt on cpu 0: __remove_hrtimer+0x52/0x130 kernel/time/hrtimer.c:991 __run_hrtimer kernel/time/hrtimer.c:1496 [inline] __hrtimer_run_queues+0x250/0x600 kernel/time/hrtimer.c:1576 hrtimer_run_softirq+0x10e/0x150 kernel/time/hrtimer.c:1593 __do_softirq+0x115/0x33f kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0xbb/0xe0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:536 [inline] smp_apic_timer_interrupt+0xe6/0x280 arch/x86/kernel/apic/apic.c:1137 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 read to 0xffff8880a3a65588 of 1 bytes by task 22891 on cpu 1: __tcp_ack_snd_check+0x415/0x4f0 net/ipv4/tcp_input.c:5265 tcp_ack_snd_check net/ipv4/tcp_input.c:5287 [inline] tcp_rcv_established+0x750/0xf50 net/ipv4/tcp_input.c:5708 tcp_v4_do_rcv+0x381/0x4e0 net/ipv4/tcp_ipv4.c:1561 sk_backlog_rcv include/net/sock.h:945 [inline] __release_sock+0x135/0x1e0 net/core/sock.c:2435 release_sock+0x61/0x160 net/core/sock.c:2951 sk_stream_wait_memory+0x3d7/0x7c0 net/core/stream.c:145 tcp_sendmsg_locked+0xb47/0x1f30 net/ipv4/tcp.c:1393 tcp_sendmsg+0x39/0x60 net/ipv4/tcp.c:1434 inet_sendmsg+0x6d/0x90 net/ipv4/af_inet.c:807 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 __sys_sendto+0x21f/0x320 net/socket.c:1952 __do_sys_sendto net/socket.c:1964 [inline] __se_sys_sendto net/socket.c:1960 [inline] __x64_sys_sendto+0x89/0xb0 net/socket.c:1960 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 24652 Comm: syz-executor.3 Not tainted 5.4.0-rc3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 [ tglx: Added comments ] Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191106174804.74723-1-edumazet@google.com --- kernel/time/hrtimer.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) 
(limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 65605530ee34..7f31932216a1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -966,7 +966,8 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; - timer->state = HRTIMER_STATE_ENQUEUED; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } @@ -988,7 +989,8 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; - timer->state = newstate; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; @@ -1013,8 +1015,9 @@ static void __remove_hrtimer(struct hrtimer *timer, static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { - if (hrtimer_is_queued(timer)) { - u8 state = timer->state; + u8 state = timer->state; + + if (state & HRTIMER_STATE_ENQUEUED) { int reprogram; /* -- cgit v1.2.3 From acaade1af3587132e7ea585f470a95261e14f60c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 09:57:09 +0100 Subject: dma-direct: remove __dma_direct_free_pages We can just call dma_free_contiguous directly instead of wrapping it. Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov --- kernel/dma/direct.c | 11 +++-------- kernel/dma/remap.c | 4 ++-- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8402b29c280f..a7a2739fb747 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -153,7 +153,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, * so log an error and fail. 
*/ dev_info(dev, "Rejecting highmem page from CMA.\n"); - __dma_direct_free_pages(dev, size, page); + dma_free_contiguous(dev, page, size); return NULL; } @@ -175,11 +175,6 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, return ret; } -void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page) -{ - dma_free_contiguous(dev, page, size); -} - void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { @@ -188,7 +183,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ - __dma_direct_free_pages(dev, size, cpu_addr); + dma_free_contiguous(dev, cpu_addr, size); return; } @@ -198,7 +193,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && dma_alloc_need_uncached(dev, attrs)) cpu_addr = cached_kernel_address(cpu_addr); - __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr)); + dma_free_contiguous(dev, virt_to_page(cpu_addr), size); } void *dma_direct_alloc(struct device *dev, size_t size, diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index c00b9258fa6a..fb1e50c2d48a 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -238,7 +238,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) { - __dma_direct_free_pages(dev, size, page); + dma_free_contiguous(dev, page, size); return ret; } @@ -256,7 +256,7 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, struct page *page = pfn_to_page(__phys_to_pfn(phys)); vunmap(vaddr); - __dma_direct_free_pages(dev, size, page); + dma_free_contiguous(dev, page, size); } } -- cgit v1.2.3 From 4e1003aa56a7d60ddb048e43a7a51368fcfe36af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 09:57:32 +0100 Subject: dma-direct: remove the dma_handle argument to __dma_direct_alloc_pages The argument isn't used anywhere, so stop passing it. 
Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov --- kernel/dma/direct.c | 4 ++-- kernel/dma/remap.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a7a2739fb747..724c282dd943 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -83,7 +83,7 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) } struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) + gfp_t gfp, unsigned long attrs) { size_t alloc_size = PAGE_ALIGN(size); int node = dev_to_node(dev); @@ -131,7 +131,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - page = __dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); + page = __dma_direct_alloc_pages(dev, size, gfp, attrs); if (!page) return NULL; diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index fb1e50c2d48a..90d5ce77c189 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -226,7 +226,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, goto done; } - page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs); + page = __dma_direct_alloc_pages(dev, size, flags, attrs); if (!page) return NULL; -- cgit v1.2.3 From 714641c3670cdc75371a7ff5bdfd5e9a170c7ffd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 12:25:46 -0500 Subject: ftrace: Separate out the copying of a ftrace_hash from __ftrace_hash_move() Most of the functionality of __ftrace_hash_move() can be reused, but not all of it. That is, __ftrace_hash_move() is used to simply make a new hash from an existing one, using the same size as the original. Creating a dup_hash(), where we can specify a new size will be useful when we want to create a hash with a default size, or simply copy the old one. Signed-off-by: Steven Rostedt (VMWare) --- kernel/trace/ftrace.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 89e9128652ef..76e5de8c7822 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1372,23 +1372,15 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, struct ftrace_hash *new_hash); -static struct ftrace_hash * -__ftrace_hash_move(struct ftrace_hash *src) +static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size) { struct ftrace_func_entry *entry; - struct hlist_node *tn; - struct hlist_head *hhd; struct ftrace_hash *new_hash; - int size = src->count; + struct hlist_head *hhd; + struct hlist_node *tn; int bits = 0; int i; - /* - * If the new source is empty, just return the empty_hash. - */ - if (ftrace_hash_empty(src)) - return EMPTY_HASH; - /* * Make the hash size about 1/2 the # found */ @@ -1413,10 +1405,23 @@ __ftrace_hash_move(struct ftrace_hash *src) __add_hash_entry(new_hash, entry); } } - return new_hash; } +static struct ftrace_hash * +__ftrace_hash_move(struct ftrace_hash *src) +{ + int size = src->count; + + /* + * If the new source is empty, just return the empty_hash. 
+ */ + if (ftrace_hash_empty(src)) + return EMPTY_HASH; + + return dup_hash(src, size); +} + static int ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash **dst, struct ftrace_hash *src) -- cgit v1.2.3 From 7e16f581a81759bafea04d049134b32d1a881226 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 12:26:46 -0500 Subject: ftrace: Separate out functionality from ftrace_location_range() Create a new function called lookup_rec() from the functionality of ftrace_location_range(). The difference is that lookup_rec() returns the record that it finds, whereas ftrace_location_range() returns only whether it found a match or not. lookup_rec() is static, and can be used for new functionality where ftrace needs to find the record of a specific address. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 76e5de8c7822..b0e7f03919de 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1541,6 +1541,26 @@ static int ftrace_cmp_recs(const void *a, const void *b) return 0; } +static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec = NULL; + struct dyn_ftrace key; + + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + } + return rec; +} + /** * ftrace_location_range - return the first address of a traced location * if it touches the given ip range @@ -1555,23 +1575,11 @@ static int ftrace_cmp_recs(const void *a, const void *b) */ unsigned long ftrace_location_range(unsigned long start, unsigned long end) { - struct ftrace_page *pg; struct dyn_ftrace *rec; - struct dyn_ftrace key; - key.ip = start; - key.flags = end; /* overload flags, as it is unsigned long */ - - for (pg = ftrace_pages_start; pg; pg = pg->next) { - if (end < pg->records[0].ip || - start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) - continue; - rec = bsearch(&key, pg->records, pg->index, - sizeof(struct dyn_ftrace), - ftrace_cmp_recs); - if (rec) - return rec->ip; - } + rec = lookup_rec(start, end); + if (rec) + return rec->ip; return 0; } -- cgit v1.2.3 From 153bedbac2ebd475e1c7c2d2fa0c042f5525927d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Nov 2019 17:08:55 +0100 Subject: irq_work: Convert flags to atomic_t We need to convert flags to atomic_t in order to later fix an ordering issue on atomic_cmpxchg() failure. This will allow us to use atomic_fetch_or(). Also clarify the nature of those flags. [ mingo: Converted two more usage sites the original patch missed. ] Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108160858.31665-2-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/bpf/stackmap.c | 2 +- kernel/irq_work.c | 18 +++++++++--------- kernel/printk/printk.c | 2 +- kernel/trace/bpf_trace.c | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 052580c33d26..4d31284095e2 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -289,7 +289,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, if (in_nmi()) { work = this_cpu_ptr(&up_read_work); - if (work->irq_work.flags & IRQ_WORK_BUSY) + if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) /* cannot queue more up_read, fallback */ irq_work_busy = true; } diff --git a/kernel/irq_work.c b/kernel/irq_work.c index d42acaf81886..df0dbf4d859b 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -29,16 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list); */ static bool irq_work_claim(struct irq_work *work) { - unsigned long flags, oflags, nflags; + int flags, oflags, nflags; /* * Start with our best wish as a premise but only trust any * flag value after cmpxchg() result. */ - flags = work->flags & ~IRQ_WORK_PENDING; + flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; for (;;) { nflags = flags | IRQ_WORK_CLAIMED; - oflags = cmpxchg(&work->flags, flags, nflags); + oflags = atomic_cmpxchg(&work->flags, flags, nflags); if (oflags == flags) break; if (oflags & IRQ_WORK_PENDING) @@ -61,7 +61,7 @@ void __weak arch_irq_work_raise(void) static void __irq_work_queue_local(struct irq_work *work) { /* If the work is "lazy", handle it from next tick if any */ - if (work->flags & IRQ_WORK_LAZY) { + if (atomic_read(&work->flags) & IRQ_WORK_LAZY) { if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && tick_nohz_tick_stopped()) arch_irq_work_raise(); @@ -143,7 +143,7 @@ static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; - unsigned long flags; + int flags; BUG_ON(!irqs_disabled()); @@ -159,15 +159,15 @@ static void irq_work_run_list(struct llist_head *list) * to claim that work don't rely on us to handle their data * while we are in the middle of the func. */ - flags = work->flags & ~IRQ_WORK_PENDING; - xchg(&work->flags, flags); + flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; + atomic_xchg(&work->flags, flags); work->func(work); /* * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. 
*/ - (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); + (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); } } @@ -199,7 +199,7 @@ void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); - while (work->flags & IRQ_WORK_BUSY) + while (atomic_read(&work->flags) & IRQ_WORK_BUSY) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ca65327a6de8..865727373a3b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2961,7 +2961,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { .func = wake_up_klogd_work_func, - .flags = IRQ_WORK_LAZY, + .flags = ATOMIC_INIT(IRQ_WORK_LAZY), }; void wake_up_klogd(void) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 44bd08f2443b..ff467a4e2639 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -660,7 +660,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig) return -EINVAL; work = this_cpu_ptr(&send_signal_work); - if (work->irq_work.flags & IRQ_WORK_BUSY) + if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) return -EBUSY; /* Add the current task, which is the target of sending signal, -- cgit v1.2.3 From 25269871db1ad0cbbaafd5098cbdb40c8db4ccb9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Nov 2019 17:08:56 +0100 Subject: irq_work: Fix irq_work_claim() memory ordering When irq_work_claim() finds the IRQ_WORK_PENDING flag already set, we just return and don't raise a new IPI. We expect the destination to see and handle our latest updates thanks to the pairing atomic_xchg() in irq_work_run_list(). But cmpxchg() doesn't guarantee a full memory barrier upon failure. So it's possible that the destination misses our latest updates. So use atomic_fetch_or() instead, which is unconditionally fully ordered, performs exactly what we want here, and simplifies the code. Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108160858.31665-3-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index df0dbf4d859b..255454a48346 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -29,24 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list); */ static bool irq_work_claim(struct irq_work *work) { - int flags, oflags, nflags; + int oflags; + oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); /* - * Start with our best wish as a premise but only trust any - * flag value after cmpxchg() result. + * If the work is already pending, no need to raise the IPI. + * The pairing atomic_xchg() in irq_work_run() makes sure + * everything we did before is visible.
*/ - flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; - for (;;) { - nflags = flags | IRQ_WORK_CLAIMED; - oflags = atomic_cmpxchg(&work->flags, flags, nflags); - if (oflags == flags) - break; - if (oflags & IRQ_WORK_PENDING) - return false; - flags = oflags; - cpu_relax(); - } - + if (oflags & IRQ_WORK_PENDING) + return false; return true; } -- cgit v1.2.3 From feb4a51323babe13315c3b783ea7f1cf25368918 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Nov 2019 17:08:57 +0100 Subject: irq_work: Slightly simplify IRQ_WORK_PENDING clearing Instead of fetching the value of flags and performing an xchg() to clear a bit, just use atomic_fetch_andnot(), which is more suitable for doing that job in one operation while keeping the full ordering. Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108160858.31665-4-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 255454a48346..49c53f80a13a 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -34,7 +34,7 @@ static bool irq_work_claim(struct irq_work *work) oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); /* * If the work is already pending, no need to raise the IPI. - * The pairing atomic_xchg() in irq_work_run() makes sure + * The pairing atomic_fetch_andnot() in irq_work_run() makes sure * everything we did before is visible. */ if (oflags & IRQ_WORK_PENDING) @@ -135,7 +135,6 @@ static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; - int flags; BUG_ON(!irqs_disabled()); @@ -144,6 +143,7 @@ static void irq_work_run_list(struct llist_head *list) llnode = llist_del_all(list); llist_for_each_entry_safe(work, tmp, llnode, llnode) { + int flags; /* * Clear the PENDING bit, after this point the @work * can be re-used. * to claim that work don't rely on us to handle their data * while we are in the middle of the func. */ - flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; - atomic_xchg(&work->flags, flags); + flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); work->func(work); /* -- cgit v1.2.3 From 34dc0ea6bc960f1f57b2148f01a3f4da23f87013 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 11:01:37 +0100 Subject: dma-direct: provide mmap and get_sgtable method overrides For dma-direct we know that the DMA address is an encoding of the physical address that we can trivially decode. Use that fact to provide implementations that do not need the arch_dma_coherent_to_pfn architecture hook. Note that we can still support mmap of non-coherent memory only if the architecture provides a way to set an uncached bit in the page tables. This must be true for architectures that use the generic remap helpers, but other architectures can also manually select it.
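As a hedged sketch of the call path this serves: a driver exporting a coherent buffer to userspace goes through dma_mmap_coherent(), which on dma-direct devices now ends up in dma_direct_mmap(); everything except the dma_* calls below is invented for illustration.

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mydrv_dev *md = file->private_data;	/* hypothetical driver state */

	/* md->vaddr/md->dma_addr came from dma_alloc_coherent(md->dev, ...) */
	return dma_mmap_coherent(md->dev, vma, md->vaddr, md->dma_addr, md->size);
}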
Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov --- kernel/dma/Kconfig | 12 ++++++++--- kernel/dma/direct.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/dma/mapping.c | 45 +++++++-------------------------------- kernel/dma/remap.c | 6 ------ 4 files changed, 75 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 73c5c2b8e824..4c103a24e380 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -51,9 +51,6 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL config ARCH_HAS_DMA_PREP_COHERENT bool -config ARCH_HAS_DMA_COHERENT_TO_PFN - bool - config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool @@ -68,9 +65,18 @@ config SWIOTLB bool select NEED_DMA_MAP_STATE +# +# Should be selected if we can mmap non-coherent mappings to userspace. +# The only thing that is really required is a way to set an uncached bit +# in the pagetables +# +config DMA_NONCOHERENT_MMAP + bool + config DMA_REMAP depends on MMU select GENERIC_ALLOCATOR + select DMA_NONCOHERENT_MMAP bool config DMA_DIRECT_REMAP diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 724c282dd943..58beaa9ddd27 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -43,6 +43,12 @@ static inline dma_addr_t phys_to_dma_direct(struct device *dev, return phys_to_dma(dev, phys); } +static inline struct page *dma_direct_to_page(struct device *dev, + dma_addr_t dma_addr) +{ + return pfn_to_page(PHYS_PFN(dma_to_phys(dev, dma_addr))); +} + u64 dma_direct_get_required_mask(struct device *dev) { u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); @@ -379,6 +385,59 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, } EXPORT_SYMBOL(dma_direct_map_resource); +int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + struct page *page = dma_direct_to_page(dev, dma_addr); + int ret; + + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (!ret) + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + return ret; +} + +#ifdef CONFIG_MMU +bool dma_direct_can_mmap(struct device *dev) +{ + return dev_is_dma_coherent(dev) || + IS_ENABLED(CONFIG_DMA_NONCOHERENT_MMAP); +} + +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + unsigned long user_count = vma_pages(vma); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long pfn = PHYS_PFN(dma_to_phys(dev, dma_addr)); + int ret = -ENXIO; + + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); + + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) + return ret; + + if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff) + return -ENXIO; + return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, + user_count << PAGE_SHIFT, vma->vm_page_prot); +} +#else /* CONFIG_MMU */ +bool dma_direct_can_mmap(struct device *dev) +{ + return false; +} + +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + return -ENXIO; +} +#endif /* CONFIG_MMU */ + /* * Because 32-bit DMA masks are so common we expect every architecture to be * able to satisfy them - either by not supporting more physical memory, or by diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index d9334f31a5af..12ff766ec1fa 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -112,24 +112,9 @@ int 
dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { - struct page *page; + struct page *page = virt_to_page(cpu_addr); int ret; - if (!dev_is_dma_coherent(dev)) { - unsigned long pfn; - - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) - return -ENXIO; - - /* If the PFN is not valid, we do not have a struct page */ - pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); - if (!pfn_valid(pfn)) - return -ENXIO; - page = pfn_to_page(pfn); - } else { - page = virt_to_page(cpu_addr); - } - ret = sg_alloc_table(sgt, 1, GFP_KERNEL); if (!ret) sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); @@ -154,7 +139,7 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_is_direct(ops)) - return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, + return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); if (!ops->get_sgtable) return -ENXIO; @@ -192,7 +177,6 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; - unsigned long pfn; int ret = -ENXIO; vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); @@ -203,19 +187,8 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, if (off >= count || user_count > count - off) return -ENXIO; - if (!dev_is_dma_coherent(dev)) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) - return -ENXIO; - - /* If the PFN is not valid, we do not have a struct page */ - pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); - if (!pfn_valid(pfn)) - return -ENXIO; - } else { - pfn = page_to_pfn(virt_to_page(cpu_addr)); - } - - return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, + return remap_pfn_range(vma, vma->vm_start, + page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff, user_count << PAGE_SHIFT, vma->vm_page_prot); #else return -ENXIO; @@ -233,12 +206,8 @@ bool dma_can_mmap(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) { - return IS_ENABLED(CONFIG_MMU) && - (dev_is_dma_coherent(dev) || - IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)); - } - + if (dma_is_direct(ops)) + return dma_direct_can_mmap(dev); return ops->mmap != NULL; } EXPORT_SYMBOL_GPL(dma_can_mmap); @@ -263,7 +232,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_is_direct(ops)) - return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, + return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); if (!ops->mmap) return -ENXIO; diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 90d5ce77c189..3c49499ee6b0 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -259,10 +259,4 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_free_contiguous(dev, page, size); } } - -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return __phys_to_pfn(dma_to_phys(dev, dma_addr)); -} #endif /* CONFIG_DMA_DIRECT_REMAP */ -- cgit v1.2.3 From 3acac065508f6cc60ac9d3e4b7c6cc37fd91d531 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 11:06:32 +0100 Subject: dma-mapping: merge the generic remapping helpers into dma-direct Integrate the generic dma remapping implementation into the main flow. 
This prepares for architectures like xtensa that use an uncached segment for pages in the kernel mapping, but can also remap highmem from CMA. To simplify that implementation, we now always deduce the page from the physical address via the DMA address instead of the virtual address. Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov --- kernel/dma/direct.c | 60 ++++++++++++++++++++++++++++++++++++++++++----------- kernel/dma/remap.c | 49 ------------------------------------------- 2 files changed, 48 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 58beaa9ddd27..22a2e0833862 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -137,6 +138,15 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_alloc_need_uncached(dev, attrs) && + !gfpflags_allow_blocking(gfp)) { + ret = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + if (!ret) + return NULL; + goto done; + } + page = __dma_direct_alloc_pages(dev, size, gfp, attrs); if (!page) return NULL; @@ -146,9 +156,28 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, /* remove any dirty cache lines on the kernel alias */ if (!PageHighMem(page)) arch_dma_prep_coherent(page, size); - *dma_handle = phys_to_dma(dev, page_to_phys(page)); /* return the page pointer as the opaque cookie */ - return page; + ret = page; + goto done; + } + + if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_alloc_need_uncached(dev, attrs)) || + (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { + /* remove any dirty cache lines on the kernel alias */ + arch_dma_prep_coherent(page, PAGE_ALIGN(size)); + + /* create a coherent mapping */ + ret = dma_common_contiguous_remap(page, PAGE_ALIGN(size), + dma_pgprot(dev, PAGE_KERNEL, attrs), + __builtin_return_address(0)); + if (!ret) { + dma_free_contiguous(dev, page, size); + return ret; + } + + memset(ret, 0, size); + goto done; } if (PageHighMem(page)) { @@ -164,12 +193,9 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, } ret = page_address(page); - if (force_dma_unencrypted(dev)) { + if (force_dma_unencrypted(dev)) set_memory_decrypted((unsigned long)ret, 1 << get_order(size)); - *dma_handle = __phys_to_dma(dev, page_to_phys(page)); - } else { - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - } + memset(ret, 0, size); if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && @@ -177,7 +203,11 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, arch_dma_prep_coherent(page, size); ret = uncached_kernel_address(ret); } - +done: + if (force_dma_unencrypted(dev)) + *dma_handle = __phys_to_dma(dev, page_to_phys(page)); + else + *dma_handle = phys_to_dma(dev, page_to_phys(page)); return ret; } @@ -193,19 +223,24 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, return; } + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_free_from_pool(cpu_addr, PAGE_ALIGN(size))) + return; + if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); - if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && - dma_alloc_need_uncached(dev, attrs)) - cpu_addr = cached_kernel_address(cpu_addr); - dma_free_contiguous(dev, virt_to_page(cpu_addr), size); + if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) + vunmap(cpu_addr); + + dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);
} void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); @@ -215,6 +250,7 @@ void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); else diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 3c49499ee6b0..d47bd40fc0f5 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -210,53 +210,4 @@ bool dma_free_from_pool(void *start, size_t size) gen_pool_free(atomic_pool, (unsigned long)start, size); return true; } - -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flags, unsigned long attrs) -{ - struct page *page = NULL; - void *ret; - - size = PAGE_ALIGN(size); - - if (!gfpflags_allow_blocking(flags)) { - ret = dma_alloc_from_pool(size, &page, flags); - if (!ret) - return NULL; - goto done; - } - - page = __dma_direct_alloc_pages(dev, size, flags, attrs); - if (!page) - return NULL; - - /* remove any dirty cache lines on the kernel alias */ - arch_dma_prep_coherent(page, size); - - /* create a coherent mapping */ - ret = dma_common_contiguous_remap(page, size, - dma_pgprot(dev, PAGE_KERNEL, attrs), - __builtin_return_address(0)); - if (!ret) { - dma_free_contiguous(dev, page, size); - return ret; - } - - memset(ret, 0, size); -done: - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - return ret; -} - -void arch_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) -{ - if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { - phys_addr_t phys = dma_to_phys(dev, dma_handle); - struct page *page = pfn_to_page(__phys_to_pfn(phys)); - - vunmap(vaddr); - dma_free_contiguous(dev, page, size); - } -} #endif /* CONFIG_DMA_DIRECT_REMAP */ -- cgit v1.2.3 From d3694f30732fd2a334b93f087033c5a5836f7aba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Oct 2019 12:32:04 -0700 Subject: dma-debug: reorder struct dma_debug_entry fields Move all fields used during exact match lookups to the first cache line. This makes debug_dma_mapping_error() and friends about 50% faster. Since it removes two 32bit holes, force a cacheline alignment on struct dma_debug_entry. 
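For context (a sketch from memory, not part of this diff): the exact-match lookup that dominates debug_dma_mapping_error() compares just two fields, so packing both into the entry's first cache line is what buys the speedup.

static bool exact_match(struct dma_debug_entry *a, struct dma_debug_entry *b)
{
	/* Both fields now sit in the first cache line of the entry. */
	return ((a->dev_addr == b->dev_addr) && (a->dev == b->dev));
}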
Signed-off-by: Eric Dumazet Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 4ad74f5987ea..a5b85dabfb8c 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -54,33 +54,33 @@ enum map_err_types { * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping * @list: node on pre-allocated free_entries list * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent - * @type: single, page, sg, coherent - * @pfn: page frame of the start address - * @offset: offset of mapping relative to pfn * @size: length of the mapping + * @type: single, page, sg, coherent * @direction: enum dma_data_direction * @sg_call_ents: 'nents' from dma_map_sg * @sg_mapped_ents: 'mapped_ents' from dma_map_sg + * @pfn: page frame of the start address + * @offset: offset of mapping relative to pfn * @map_err_type: track whether dma_mapping_error() was checked * @stacktrace: support backtraces when a violation is detected */ struct dma_debug_entry { struct list_head list; struct device *dev; - int type; - unsigned long pfn; - size_t offset; u64 dev_addr; u64 size; + int type; int direction; int sg_call_ents; int sg_mapped_ents; + unsigned long pfn; + size_t offset; enum map_err_types map_err_type; #ifdef CONFIG_STACKTRACE unsigned int stack_len; unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; #endif -}; +} ____cacheline_aligned_in_smp; typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *); -- cgit v1.2.3 From 5e76f564572b85735de4b75a5e73b514be2562be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Oct 2019 11:48:44 -0700 Subject: dma-debug: increase HASH_SIZE With modern NICs, it is not unusual to have about 256,000 active DMA mappings, and a hash size of 1024 buckets is too small. Forcing a full cache line per bucket does not seem useful, especially now that we have contention on free_entries_lock for allocations and freeing of entries. Better to use the space to fit more buckets. Signed-off-by: Eric Dumazet Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index a5b85dabfb8c..004496654aaa 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -27,7 +27,7 @@ #include -#define HASH_SIZE 1024ULL +#define HASH_SIZE 16384ULL #define HASH_FN_SHIFT 13 #define HASH_FN_MASK (HASH_SIZE - 1) @@ -87,7 +87,7 @@ typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *); struct hash_bucket { struct list_head list; spinlock_t lock; -} ____cacheline_aligned_in_smp; +}; /* Hash list to save the allocated dma addresses */ static struct hash_bucket dma_entry_hash[HASH_SIZE]; -- cgit v1.2.3 From 9a066357184485784f782719093ff804d05b85db Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:52 +0900 Subject: kheaders: remove unneeded 'cat' command piped to 'head' / 'tail' The 'head' and 'tail' commands can take a file path directly. So, you do not need to run 'cat'. cat kernel/kheaders.md5 | head -1 ... is equivalent to: head -1 kernel/kheaders.md5 and the latter saves forking one process. While I was here, I replaced 'head -1' with 'head -n 1'. I also replaced '==' with '=' since we do not have a good reason to use the bashism.
Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 5a0fc0b0403a..b8054b0d5010 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -41,10 +41,10 @@ obj_files_md5="$(find $dir_list -name "*.h" | this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)" if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi if [ -f kernel/kheaders.md5 ] && - [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && - [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && - [ "$(cat kernel/kheaders.md5|head -3|tail -1)" == "$this_file_md5" ] && - [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then + [ "$(head -n 1 kernel/kheaders.md5)" = "$src_files_md5" ] && + [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$obj_files_md5" ] && + [ "$(head -n 3 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] && + [ "$(tail -n 1 kernel/kheaders.md5)" = "$tarfile_md5" ]; then exit fi -- cgit v1.2.3 From 0e11773e76098729552b750ccff79374d1e62002 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:53 +0900 Subject: kheaders: optimize md5sum calculation for in-tree builds This script computes md5sum of headers in srctree and in objtree. However, when we are building in-tree, we know the srctree and the objtree are the same. That is, we end up with the same computation twice. In fact, the first two lines of kernel/kheaders.md5 are always the same for in-tree builds. Unify the two md5sum calculations. For in-tree builds ($building_out_of_srctree is empty), we check only two directories, "include", and "arch/$SRCARCH/include". For out-of-tree builds ($building_out_of_srctree is 1), we check 4 directories, "$srctree/include", "$srctree/arch/$SRCARCH/include", "include", and "arch/$SRCARCH/include" since we know they are all different. Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index b8054b0d5010..6ff86e62787f 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -21,29 +21,30 @@ arch/$SRCARCH/include/ # Uncomment it for debugging. # if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter; # else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi -# find $src_file_list -name "*.h" | xargs ls -l > /tmp/src-ls-$iter -# find $obj_file_list -name "*.h" | xargs ls -l > /tmp/obj-ls-$iter +# find $all_dirs -name "*.h" | xargs ls -l > /tmp/ls-$iter + +all_dirs= +if [ "$building_out_of_srctree" ]; then + for d in $dir_list; do + all_dirs="$all_dirs $srctree/$d" + done +fi +all_dirs="$all_dirs $dir_list" # include/generated/compile.h is ignored because it is touched even when none # of the source files changed. This causes pointless regeneration, so let us # ignore them for md5 calculation. 
-pushd $srctree > /dev/null -src_files_md5="$(find $dir_list -name "*.h" | - grep -v "include/generated/compile.h" | - grep -v "include/generated/autoconf.h" | - xargs ls -l | md5sum | cut -d ' ' -f1)" -popd > /dev/null -obj_files_md5="$(find $dir_list -name "*.h" | - grep -v "include/generated/compile.h" | - grep -v "include/generated/autoconf.h" | +headers_md5="$(find $all_dirs -name "*.h" | + grep -v "include/generated/compile.h" | + grep -v "include/generated/autoconf.h" | xargs ls -l | md5sum | cut -d ' ' -f1)" + # Any changes to this script will also cause a rebuild of the archive. this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)" if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi if [ -f kernel/kheaders.md5 ] && - [ "$(head -n 1 kernel/kheaders.md5)" = "$src_files_md5" ] && - [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$obj_files_md5" ] && - [ "$(head -n 3 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] && + [ "$(head -n 1 kernel/kheaders.md5)" = "$headers_md5" ] && + [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] && [ "$(tail -n 1 kernel/kheaders.md5)" = "$tarfile_md5" ]; then exit fi @@ -79,8 +80,7 @@ find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \ --owner=0 --group=0 --numeric-owner --no-recursion \ -Jcf $tarfile -C $cpio_dir/ -T - > /dev/null -echo "$src_files_md5" > kernel/kheaders.md5 -echo "$obj_files_md5" >> kernel/kheaders.md5 +echo $headers_md5 > kernel/kheaders.md5 echo "$this_file_md5" >> kernel/kheaders.md5 echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 -- cgit v1.2.3 From ea79e5168be644fdaf7d4e6a73eceaf07b3da76a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:54 +0900 Subject: kheaders: optimize header copy for in-tree builds This script copies headers with the cpio command twice; first from srctree, and then from objtree. However, when we are building in-tree, we know the srctree and the objtree are the same. That is, all the headers copied by the first cpio are overwritten by the second one. Skip the first cpio when we are building in-tree. Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 6ff86e62787f..0f7752dd93a6 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -56,14 +56,16 @@ fi rm -rf $cpio_dir mkdir $cpio_dir -pushd $srctree > /dev/null -for f in $dir_list; - do find "$f" -name "*.h"; -done | cpio --quiet -pd $cpio_dir -popd > /dev/null +if [ "$building_out_of_srctree" ]; then + pushd $srctree > /dev/null + for f in $dir_list + do find "$f" -name "*.h"; + done | cpio --quiet -pd $cpio_dir + popd > /dev/null +fi -# The second CPIO can complain if files already exist which can -# happen with out of tree builds. Just silence CPIO for now. +# The second CPIO can complain if files already exist which can happen with out +# of tree builds having stale headers in srctree. Just silence CPIO for now. for f in $dir_list; do find "$f" -name "*.h"; done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 -- cgit v1.2.3 From 1463f74f492eea7191f0178e01f3d38371a48210 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:55 +0900 Subject: kheaders: remove the last bashism to allow sh to run it 'pushd' ... 'popd' is the last bash-specific code in this script. One way to avoid it is to run the code in a sub-shell.
With that addressed, you can run this script with sh. I replaced $(BASH) with $(CONFIG_SHELL), and I changed the hashbang to #!/bin/sh. Signed-off-by: Masahiro Yamada --- kernel/Makefile | 2 +- kernel/gen_kheaders.sh | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index daad787fb795..42557f251fea 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -128,7 +128,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz - cmd_genikh = $(BASH) $(srctree)/kernel/gen_kheaders.sh $@ + cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 0f7752dd93a6..dc5744b93f8c 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # SPDX-License-Identifier: GPL-2.0 # This script generates an archive consisting of kernel headers @@ -57,11 +57,12 @@ rm -rf $cpio_dir mkdir $cpio_dir if [ "$building_out_of_srctree" ]; then - pushd $srctree > /dev/null - for f in $dir_list - do find "$f" -name "*.h"; - done | cpio --quiet -pd $cpio_dir - popd > /dev/null + ( + cd $srctree + for f in $dir_list + do find "$f" -name "*.h"; + done | cpio --quiet -pd $cpio_dir + ) fi # The second CPIO can complain if files already exist which can happen with out -- cgit v1.2.3 From f276031b4e2f4c961ed6d8a42f0f0124ccac2e09 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:56 +0900 Subject: kheaders: explain why include/config/autoconf.h is excluded from md5sum This comment block explains why include/generated/compile.h is omitted, but nothing about include/generated/autoconf.h, which might be more difficult to understand. Add more comments. Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index dc5744b93f8c..e13ca842eb7e 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -32,8 +32,15 @@ fi all_dirs="$all_dirs $dir_list" # include/generated/compile.h is ignored because it is touched even when none -# of the source files changed. This causes pointless regeneration, so let us -# ignore them for md5 calculation. +# of the source files changed. +# +# When Kconfig regenerates include/generated/autoconf.h, its timestamp is +# updated, but the contents might be still the same. When any CONFIG option is +# changed, Kconfig touches the corresponding timestamp file include/config/*.h. +# Hence, the md5sum detects the configuration change anyway. We do not need to +# check include/generated/autoconf.h explicitly. +# +# Ignore them for md5 calculation to avoid pointless regeneration. headers_md5="$(find $all_dirs -name "*.h" | grep -v "include/generated/compile.h" | grep -v "include/generated/autoconf.h" | -- cgit v1.2.3 From c1d51f684c72b5eb2aecbbd47be3a2977a2dc903 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 7 Nov 2019 15:25:12 +0100 Subject: cpuidle: Use nanoseconds as the unit of time Currently, the cpuidle subsystem uses microseconds as the unit of time which (among other things) causes the idle loop to incur some integer division overhead for no clear benefit. 
In order to allow cpuidle to measure time in nanoseconds, add two new fields, exit_latency_ns and target_residency_ns, to represent the exit latency and target residency of an idle state in nanoseconds, respectively, to struct cpuidle_state and initialize them with the help of the corresponding values in microseconds provided by drivers. Additionally, change cpuidle_governor_latency_req() to return the idle state exit latency constraint in nanoseconds. Also measure idle state residency (last_residency_ns in struct cpuidle_device and time_ns in struct cpuidle_driver) in nanoseconds and update the cpuidle core and governors accordingly. However, the menu governor still computes typical intervals in microseconds to avoid integer overflows. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Doug Smythies Tested-by: Doug Smythies --- kernel/sched/idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8dad5aa600ea..1aa260702b38 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -104,7 +104,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, * update no idle residency and return. */ if (current_clr_polling_and_test()) { - dev->last_residency = 0; + dev->last_residency_ns = 0; local_irq_enable(); return -EBUSY; } -- cgit v1.2.3 From 20d087368d38c7350a4519a3b316ef7eb2504692 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 8 Nov 2019 21:34:25 +0100 Subject: time: Optimize ns_to_timespec64() ns_to_timespec64() calls div_s64_rem(), which is a rather slow function on 32-bit architectures, as it cannot take advantage of the do_div() optimizations for constant arguments. Open-code the div_s64_rem() function in ns_to_timespec64(), so a constant divider can be passed into the optimized div_u64_rem() function. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108203435.112759-3-arnd@arndb.de --- kernel/time/time.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 5c54ca632d08..45a358953f09 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -550,18 +550,21 @@ EXPORT_SYMBOL(set_normalized_timespec64); */ struct timespec64 ns_to_timespec64(const s64 nsec) { - struct timespec64 ts; + struct timespec64 ts = { 0, 0 }; s32 rem; - if (!nsec) - return (struct timespec64) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; + if (likely(nsec > 0)) { + ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + ts.tv_nsec = rem; + } else if (nsec < 0) { + /* + * With negative times, tv_sec points to the earlier + * second, and tv_nsec counts the nanoseconds since + * then, so tv_nsec is always a positive number.
+ */ + ts.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1; + ts.tv_nsec = NSEC_PER_SEC - rem - 1; } - ts.tv_nsec = rem; return ts; } -- cgit v1.2.3 From 1d6acc18fee71a0db6e4fbbfbdb247e0bd5b0655 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Tue, 15 Oct 2019 13:03:39 +0530 Subject: time: Fix spelling mistake in comment witin => within Signed-off-by: Mukesh Ojha Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1571124819-9639-1-git-send-email-mojha@codeaurora.org --- kernel/time/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 45a358953f09..ea6e7e47cc37 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -179,7 +179,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz return error; if (tz) { - /* Verify we're witin the +-15 hrs range */ + /* Verify we're within the +-15 hrs range */ if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) return -EINVAL; -- cgit v1.2.3 From cf25e24db61cc9df42c47485a2ec2bff4e9a3692 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Nov 2019 11:07:58 +0100 Subject: time: Rename tsk->real_start_time to ->start_boottime Since it stores CLOCK_BOOTTIME, not, as the name suggests, CLOCK_REALTIME, let's rename ->real_start_time to ->start_boottime. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..1392ee8f4848 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2130,7 +2130,7 @@ static __latent_entropy struct task_struct *copy_process( */ p->start_time = ktime_get_ns(); - p->real_start_time = ktime_get_boottime_ns(); + p->start_boottime = ktime_get_boottime_ns(); /* * Make it visible to the rest of the system, but dont wake it up yet. -- cgit v1.2.3 From 763e34e74bb7d5c316015e2e39fcc8520bfd071c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:07:06 -0500 Subject: ftrace: Add register_ftrace_direct() Add the start of the functionality to allow other trampolines to use the ftrace mcount/fentry/nop location. This adds two new functions: register_ftrace_direct() and unregister_ftrace_direct(). Both take two parameters: the first is the instruction address of where the mcount/fentry/nop exists, and the second is the trampoline to have that location called. This will handle cases where ftrace is already used on that same location, and will make it still work, where the registered direct-call trampoline will get called after all the registered ftrace callers are handled. Currently, it will not allow for IP_MODIFY functions to be called at the same locations, which include some kprobes and live kernel patching. At this point, no architecture supports this. This is only the start of implementing the framework.
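A minimal usage sketch under stated assumptions: my_tramp is a hypothetical arch-specific assembly trampoline that preserves the traced function's arguments, and wake_up_process() is just an example attach point.

#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/sched.h>

extern void my_tramp(void);	/* assumed to be provided elsewhere */

static int __init mydirect_init(void)
{
	/* Attach my_tramp to the fentry/nop site at wake_up_process(). */
	return register_ftrace_direct((unsigned long)wake_up_process,
				      (unsigned long)my_tramp);
}

static void __exit mydirect_exit(void)
{
	unregister_ftrace_direct((unsigned long)wake_up_process,
				 (unsigned long)my_tramp);
}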
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 8 ++ kernel/trace/ftrace.c | 269 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 272 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e08527f50d2a..624a05e99b0b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -33,6 +33,9 @@ config HAVE_DYNAMIC_FTRACE config HAVE_DYNAMIC_FTRACE_WITH_REGS bool +config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + bool + config HAVE_FTRACE_MCOUNT_RECORD bool help @@ -557,6 +560,11 @@ config DYNAMIC_FTRACE_WITH_REGS depends on DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE_WITH_REGS +config DYNAMIC_FTRACE_WITH_DIRECT_CALLS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b0e7f03919de..329a3f3789a1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1023,6 +1023,7 @@ static bool update_all_ops; struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; + unsigned long direct; /* for direct lookup only */ }; struct ftrace_func_probe { @@ -1730,6 +1731,9 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) return false; + if (ops->flags & FTRACE_OPS_FL_DIRECT) + rec->flags |= FTRACE_FL_DIRECT; + /* * If there's only a single callback registered to a * function, and the ops has a trampoline registered @@ -1757,6 +1761,15 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, return false; rec->flags--; + /* + * Only the internal direct_ops should have the + * DIRECT flag set. Thus, if it is removing a + * function, then that function should no longer + * be direct. + */ + if (ops->flags & FTRACE_OPS_FL_DIRECT) + rec->flags &= ~FTRACE_FL_DIRECT; + /* * If the rec had REGS enabled and the ops that is * being removed had REGS set, then see if there is @@ -2092,15 +2105,34 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) * If enabling and the REGS flag does not match the REGS_EN, or * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore * this record. Set flags to fail the compare against ENABLED. + * Same for direct calls. */ if (flag) { - if (!(rec->flags & FTRACE_FL_REGS) != + if (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)) flag |= FTRACE_FL_REGS; - if (!(rec->flags & FTRACE_FL_TRAMP) != + if (!(rec->flags & FTRACE_FL_TRAMP) != !(rec->flags & FTRACE_FL_TRAMP_EN)) flag |= FTRACE_FL_TRAMP; + + /* + * Direct calls are special, as count matters. + * We must test the record for direct, if the + * DIRECT and DIRECT_EN do not match, but only + * if the count is 1. That's because, if the + * count is something other than one, we do not + * want the direct enabled (it will be done via the + * direct helper). But if DIRECT_EN is set, and + * the count is not one, we need to clear it. 
+ */ + if (ftrace_rec_count(rec) == 1) { + if (!(rec->flags & FTRACE_FL_DIRECT) != + !(rec->flags & FTRACE_FL_DIRECT_EN)) + flag |= FTRACE_FL_DIRECT; + } else if (rec->flags & FTRACE_FL_DIRECT_EN) { + flag |= FTRACE_FL_DIRECT; + } } /* If the state of this record hasn't changed, then do nothing */ @@ -2125,6 +2157,25 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) else rec->flags &= ~FTRACE_FL_TRAMP_EN; } + if (flag & FTRACE_FL_DIRECT) { + /* + * If there's only one user (direct_ops helper) + * then we can call the direct function + * directly (no ftrace trampoline). + */ + if (ftrace_rec_count(rec) == 1) { + if (rec->flags & FTRACE_FL_DIRECT) + rec->flags |= FTRACE_FL_DIRECT_EN; + else + rec->flags &= ~FTRACE_FL_DIRECT_EN; + } else { + /* + * Can only call directly if there's + * only one callback to the function. + */ + rec->flags &= ~FTRACE_FL_DIRECT_EN; + } + } } /* @@ -2154,7 +2205,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) * and REGS states. The _EN flags must be disabled though. */ rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN | - FTRACE_FL_REGS_EN); + FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN); } ftrace_bug_type = FTRACE_BUG_NOP; @@ -2309,6 +2360,51 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) return NULL; } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* Protected by rcu_tasks for reading, and direct_mutex for writing */ +static struct ftrace_hash *direct_functions = EMPTY_HASH; +static DEFINE_MUTEX(direct_mutex); + +/* + * Search the direct_functions hash to see if the given instruction pointer + * has a direct caller attached to it. + */ +static unsigned long find_rec_direct(unsigned long ip) +{ + struct ftrace_func_entry *entry; + + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) + return 0; + + return entry->direct; +} + +static void call_direct_funcs(unsigned long ip, unsigned long pip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + unsigned long addr; + + addr = find_rec_direct(ip); + if (!addr) + return; + + arch_ftrace_set_direct_caller(regs, addr); +} + +struct ftrace_ops direct_ops = { + .func = call_direct_funcs, + .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE + | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS + | FTRACE_OPS_FL_PERMANENT, +}; +#else +static inline unsigned long find_rec_direct(unsigned long ip) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + /** * ftrace_get_addr_new - Get the call address to set to * @rec: The ftrace record descriptor @@ -2322,6 +2418,15 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) { struct ftrace_ops *ops; + unsigned long addr; + + if ((rec->flags & FTRACE_FL_DIRECT) && + (ftrace_rec_count(rec) == 1)) { + addr = find_rec_direct(rec->ip); + if (addr) + return addr; + WARN_ON_ONCE(1); + } /* Trampolines take precedence over regs */ if (rec->flags & FTRACE_FL_TRAMP) { @@ -2354,6 +2459,15 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) { struct ftrace_ops *ops; + unsigned long addr; + + /* Direct calls take precedence over trampolines */ + if (rec->flags & FTRACE_FL_DIRECT_EN) { + addr = find_rec_direct(rec->ip); + if (addr) + return addr; + WARN_ON_ONCE(1); + } /* Trampolines take precedence over regs */ if (rec->flags & FTRACE_FL_TRAMP_EN) { @@ -3465,10 +3579,11 @@ static int t_show(struct seq_file *m, void *v) if (iter->flags & 
FTRACE_ITER_ENABLED) { struct ftrace_ops *ops; - seq_printf(m, " (%ld)%s%s", + seq_printf(m, " (%ld)%s%s%s", ftrace_rec_count(rec), rec->flags & FTRACE_FL_REGS ? " R" : " ", - rec->flags & FTRACE_FL_IPMODIFY ? " I" : " "); + rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ", + rec->flags & FTRACE_FL_DIRECT ? " D" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); if (ops) { @@ -3484,6 +3599,13 @@ static int t_show(struct seq_file *m, void *v) } else { add_trampoline_func(m, NULL, rec); } + if (rec->flags & FTRACE_FL_DIRECT) { + unsigned long direct; + + direct = find_rec_direct(rec->ip); + if (direct) + seq_printf(m, "\n\tdirect-->%pS", (void *)direct); + } } seq_putc(m, '\n'); @@ -4815,6 +4937,143 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable); } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/** + * register_ftrace_direct - Call a custom trampoline directly + * @ip: The address of the nop at the beginning of a function + * @addr: The address of the trampoline to call at @ip + * + * This is used to connect a direct call from the nop location (@ip) + * at the start of ftrace traced functions. The location that it calls + * (@addr) must be able to handle a direct call, and save the parameters + * of the function being traced, and restore them (or inject new ones + * if needed), before returning. + * + * Returns: + * 0 on success + * -EBUSY - Another direct function is already attached (there can be only one) + * -ENODEV - @ip does not point to a ftrace nop location (or not supported) + * -ENOMEM - There was an allocation failure. + */ +int register_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *free_hash = NULL; + struct dyn_ftrace *rec; + int ret = -EBUSY; + + mutex_lock(&direct_mutex); + + /* See if there's a direct function at @ip already */ + if (find_rec_direct(ip)) + goto out_unlock; + + ret = -ENODEV; + rec = lookup_rec(ip, ip); + if (!rec) + goto out_unlock; + + /* + * Check if the rec says it has a direct call but we didn't + * find one earlier? + */ + if (WARN_ON(rec->flags & FTRACE_FL_DIRECT)) + goto out_unlock; + + /* Make sure the ip points to the exact record */ + ip = rec->ip; + + ret = -ENOMEM; + if (ftrace_hash_empty(direct_functions) || + direct_functions->count > 2 * (1 << direct_functions->size_bits)) { + struct ftrace_hash *new_hash; + int size = ftrace_hash_empty(direct_functions) ? 
0 : + direct_functions->count + 1; + + if (size < 32) + size = 32; + + new_hash = dup_hash(direct_functions, size); + if (!new_hash) + goto out_unlock; + + free_hash = direct_functions; + direct_functions = new_hash; + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out_unlock; + + entry->ip = ip; + entry->direct = addr; + __add_hash_entry(direct_functions, entry); + + ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); + if (ret) + remove_hash_entry(direct_functions, entry); + + if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { + ret = register_ftrace_function(&direct_ops); + if (ret) + ftrace_set_filter_ip(&direct_ops, ip, 1, 0); + } + + if (ret) + kfree(entry); + out_unlock: + mutex_unlock(&direct_mutex); + + if (free_hash) { + synchronize_rcu_tasks(); + free_ftrace_hash(free_hash); + } + + return ret; +} +EXPORT_SYMBOL_GPL(register_ftrace_direct); + +int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) { + /* OK if it is off by a little */ + rec = lookup_rec(ip, ip); + if (!rec || rec->ip == ip) + goto out_unlock; + + entry = __ftrace_lookup_ip(direct_functions, rec->ip); + if (!entry) { + WARN_ON(rec->flags & FTRACE_FL_DIRECT); + goto out_unlock; + } + + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); + } + + if (direct_functions->count == 1) + unregister_ftrace_function(&direct_ops); + + ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0); + + WARN_ON(ret); + + remove_hash_entry(direct_functions, entry); + + out_unlock: + mutex_unlock(&direct_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_direct); +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + /** * ftrace_set_filter_ip - set a function to filter on in ftrace by address * @ops - the ops to set the filter with -- cgit v1.2.3 From 013bf0da0474816f57739daa006c8564ad7396a3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:11:27 -0500 Subject: ftrace: Add ftrace_find_direct_func() As the function_graph tracer modifies the return address to insert a trampoline to trace the return of a function, it must be aware of a direct caller, as when it gets called, the function's return address may not be on the stack where it expects. It may have to check whether that return address points to a direct caller and adjust if it does. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 329a3f3789a1..c4446eabacbe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4938,6 +4938,46 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + +struct ftrace_direct_func { + struct list_head next; + unsigned long addr; + int count; +}; + +static LIST_HEAD(ftrace_direct_funcs); + +/** + * ftrace_find_direct_func - test an address if it is a registered direct caller + * @addr: The address of a registered direct caller + * + * This searches to see if a ftrace direct caller has been registered + * at a specific address, and if so, it returns a descriptor for it. + * + * This can be used by architecture code to see if an address is + * a direct caller (trampoline) attached to a fentry/mcount location.
+ * This is useful for the function_graph tracer, as it may need to + * do adjustments if it traced a location that also has a direct + * trampoline attached to it. + */ +struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) +{ + struct ftrace_direct_func *entry; + bool found = false; + + /* May be called by fgraph trampoline (protected by rcu tasks) */ + list_for_each_entry_rcu(entry, &ftrace_direct_funcs, next) { + if (entry->addr == addr) { + found = true; + break; + } + } + if (found) + return entry; + + return NULL; +} + /** * register_ftrace_direct - Call a custom trampoline directly * @ip: The address of the nop at the beginning of a function @@ -4957,6 +4997,7 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, */ int register_ftrace_direct(unsigned long ip, unsigned long addr) { + struct ftrace_direct_func *direct; struct ftrace_func_entry *entry; struct ftrace_hash *free_hash = NULL; struct dyn_ftrace *rec; @@ -5005,6 +5046,18 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (!entry) goto out_unlock; + direct = ftrace_find_direct_func(addr); + if (!direct) { + direct = kmalloc(sizeof(*direct), GFP_KERNEL); + if (!direct) { + kfree(entry); + goto out_unlock; + } + direct->addr = addr; + direct->count = 0; + list_add_rcu(&direct->next, &ftrace_direct_funcs); + } + entry->ip = ip; entry->direct = addr; __add_hash_entry(direct_functions, entry); @@ -5019,8 +5072,20 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) ftrace_set_filter_ip(&direct_ops, ip, 1, 0); } - if (ret) + if (ret) { kfree(entry); + if (!direct->count) { + list_del_rcu(&direct->next); + synchronize_rcu_tasks(); + kfree(direct); + if (free_hash) + free_ftrace_hash(free_hash); + free_hash = NULL; + } + } else { + if (!direct->count) + direct->count++; + } out_unlock: mutex_unlock(&direct_mutex); @@ -5036,6 +5101,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct); int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { struct ftrace_func_entry *entry; + struct ftrace_direct_func *direct; struct dyn_ftrace *rec; int ret = -ENODEV; @@ -5066,6 +5132,17 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) remove_hash_entry(direct_functions, entry); + direct = ftrace_find_direct_func(addr); + if (!WARN_ON(!direct)) { + /* This is the good path (see the ! before WARN) */ + direct->count--; + WARN_ON(direct->count < 0); + if (!direct->count) { + list_del_rcu(&direct->next); + synchronize_rcu_tasks(); + kfree(direct); + } + } out_unlock: mutex_unlock(&direct_mutex); -- cgit v1.2.3 From a3ad1a7e39689005cb04a4f2adb82f9d55b4724f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:12:57 -0500 Subject: ftrace/x86: Add a counter to test function_graph with direct As testing for direct calls from the function graph tracer adds a little overhead (which is a lot when tracing every function), add a counter that can be used to test if function_graph tracer needs to test for a direct caller or not. It would have been nicer if we could use a static branch, but the static branch logic fails when used within the function graph tracer trampoline. 
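As a hedged sketch of the intended fast path (the real consumer is the arch return-trampoline code, which lies outside this kernel/-only diff, and the helper name below is hypothetical), the counter turns the common no-direct-callers case into a single load and branch:

	static bool return_addr_is_direct(unsigned long addr)
	{
		/* Fast path: no direct callers are registered anywhere */
		if (!ftrace_direct_func_count)
			return false;

		/* Slow path: walk the ftrace_direct_funcs list */
		return ftrace_find_direct_func(addr) != NULL;
	}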
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c4446eabacbe..f9456346ec66 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2364,6 +2364,7 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) /* Protected by rcu_tasks for reading, and direct_mutex for writing */ static struct ftrace_hash *direct_functions = EMPTY_HASH; static DEFINE_MUTEX(direct_mutex); +int ftrace_direct_func_count; /* * Search the direct_functions hash to see if the given instruction pointer @@ -5056,6 +5057,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) direct->addr = addr; direct->count = 0; list_add_rcu(&direct->next, &ftrace_direct_funcs); + ftrace_direct_func_count++; } entry->ip = ip; @@ -5081,6 +5083,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (free_hash) free_ftrace_hash(free_hash); free_hash = NULL; + ftrace_direct_func_count--; } } else { if (!direct->count) @@ -5141,6 +5144,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) list_del_rcu(&direct->next); synchronize_rcu_tasks(); kfree(direct); + ftrace_direct_func_count--; } } out_unlock: -- cgit v1.2.3 From da537f0aef1372c5204356a7df06be8769467b7b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 1 Oct 2019 14:38:07 -0400 Subject: ftrace: Add information on number of page groups allocated When looking for ways to shrink the size of the dyn_ftrace structure, knowing how many pages are allocated and how many groups those pages fall into is useful in working out the best ways to save memory. This adds an info print of how many groups of pages were used to allocate the ftrace dyn_ftrace structures, and also shows the number of pages and groups in dyn_ftrace_total_info (which is used for debugging).
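The bookkeeping itself is simple; a hedged sketch of the invariant this maintains (the helper below is illustrative, the real updates are open-coded in the diff):

	unsigned long ftrace_number_of_pages;
	unsigned long ftrace_number_of_groups;

	static void account_page_group(unsigned int order)
	{
		/* One alloc_pages() call makes one group of 2^order pages */
		ftrace_number_of_pages += 1UL << order;
		ftrace_number_of_groups++;
	}

With, say, groups of order 5, 3 and 0, the new boot message would read "ftrace: allocated 41 pages with 3 groups" (made-up numbers).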
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 14 ++++++++++++++ kernel/trace/trace.c | 21 +++++++++++++++------ kernel/trace/trace.h | 2 ++ 3 files changed, 31 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f9456346ec66..d2d488c43a6a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2991,6 +2991,8 @@ static void ftrace_shutdown_sysctl(void) static u64 ftrace_update_time; unsigned long ftrace_update_tot_cnt; +unsigned long ftrace_number_of_pages; +unsigned long ftrace_number_of_groups; static inline int ops_traces_mod(struct ftrace_ops *ops) { @@ -3115,6 +3117,9 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) goto again; } + ftrace_number_of_pages += 1 << order; + ftrace_number_of_groups++; + cnt = (PAGE_SIZE << order) / ENTRY_SIZE; pg->size = cnt; @@ -3170,6 +3175,8 @@ ftrace_allocate_pages(unsigned long num_to_init) start_pg = pg->next; kfree(pg); pg = start_pg; + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; } pr_info("ftrace: FAILED to allocate memory for functions\n"); return NULL; @@ -6173,6 +6180,8 @@ void ftrace_release_mod(struct module *mod) free_pages((unsigned long)pg->records, order); tmp_page = pg->next; kfree(pg); + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; } } @@ -6514,6 +6523,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) *last_pg = pg->next; order = get_count_order(pg->size / ENTRIES_PER_PAGE); free_pages((unsigned long)pg->records, order); + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; kfree(pg); pg = container_of(last_pg, struct ftrace_page, next); if (!(*last_pg)) @@ -6569,6 +6580,9 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); + pr_info("ftrace: allocated %ld pages with %ld groups\n", + ftrace_number_of_pages, ftrace_number_of_groups); + set_ftrace_early_filters(); return; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6a0ee9178365..5ea8c7c0f2d7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7583,14 +7583,23 @@ static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - unsigned long *p = filp->private_data; - char buf[64]; /* Not too big for a shallow stack */ + ssize_t ret; + char *buf; int r; - r = scnprintf(buf, 63, "%ld", *p); - buf[r++] = '\n'; + /* 256 should be plenty to hold the amount needed */ + buf = kmalloc(256, GFP_KERNEL); + if (!buf) + return -ENOMEM; - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n", + ftrace_update_tot_cnt, + ftrace_number_of_pages, + ftrace_number_of_groups); + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + kfree(buf); + return ret; } static const struct file_operations tracing_dyn_info_fops = { @@ -8782,7 +8791,7 @@ static __init int tracer_init_tracefs(void) #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, - &ftrace_update_tot_cnt, &tracing_dyn_info_fops); + NULL, &tracing_dyn_info_fops); #endif create_trace_instances(d_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d685c61085c0..8b590f10bc72 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -804,6 +804,8 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +extern unsigned long ftrace_number_of_pages; 
+extern unsigned long ftrace_number_of_groups; void ftrace_init_trace_array(struct trace_array *tr); #else static inline void ftrace_init_trace_array(struct trace_array *tr) { } -- cgit v1.2.3 From 91edde2e6ae1dd5e33812f076f3fe4cb7ccbfdd0 Mon Sep 17 00:00:00 2001 From: "Viktor Rosendahl (BMW)" Date: Wed, 9 Oct 2019 00:08:21 +0200 Subject: ftrace: Implement fs notification for tracing_max_latency This patch implements fsnotify notifications for the tracing_max_latency file, e.g. /sys/kernel/debug/tracing/tracing_max_latency: user space is notified through the fsnotify framework whenever a new max latency is available. One particularly interesting use of this facility is when enabling threshold tracing, through /sys/kernel/debug/tracing/tracing_thresh, together with the preempt/irqsoff tracers. This makes it possible to implement a user space program that can, with equal probability, obtain traces of latencies that occur immediately after each other in spite of the fact that the preempt/irqsoff tracers operate in overwrite mode. This facility works with the hwlat, preempt/irqsoff, and wakeup tracers. The tracers may call latency_fsnotify() from places such as __schedule() or do_idle(); this makes it impossible to call queue_work() directly without risking a deadlock. The same would happen with a softirq, kernel thread or tasklet. For this reason we use the irq_work mechanism to call queue_work(). This patch creates a new workqueue. The reason for doing this is that I wanted to use the WQ_UNBOUND and WQ_HIGHPRI flags. My thinking was that WQ_UNBOUND might help with the latency in some important cases. If we use: queue_work(system_highpri_wq, &tr->fsnotify_work); then the work will (almost) always execute on the same CPU, but if we are unlucky that CPU could be too busy while another CPU in the system would be able to process the work soon enough. queue_work_on() could be used to queue the work on another CPU but it seems difficult to select the right CPU.
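On the user-space side a program can now block on the file instead of polling it; a minimal hedged sketch, assuming debugfs is mounted in the usual place and omitting error handling:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/inotify.h>

	int main(void)
	{
		char buf[4096];
		int fd = inotify_init();

		inotify_add_watch(fd,
			"/sys/kernel/debug/tracing/tracing_max_latency",
			IN_MODIFY);

		/* Blocks until a new max latency is recorded */
		read(fd, buf, sizeof(buf));
		printf("new max latency available\n");
		return 0;
	}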
Link: http://lkml.kernel.org/r/20191008220824.7911-2-viktor.rosendahl@gmail.com Reviewed-by: Joel Fernandes (Google) Signed-off-by: Viktor Rosendahl (BMW) [ Added max() to have one compare for max latency ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 75 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/trace/trace.h | 18 +++++++++++ kernel/trace/trace_hwlat.c | 11 ++++--- 3 files changed, 98 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5ea8c7c0f2d7..f093a433cb42 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -45,6 +45,9 @@ #include #include #include +#include +#include +#include #include "trace.h" #include "trace_output.h" @@ -1497,6 +1500,74 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) } unsigned long __read_mostly tracing_thresh; +static const struct file_operations tracing_max_lat_fops; + +#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + defined(CONFIG_FSNOTIFY) + +static struct workqueue_struct *fsnotify_wq; + +static void latency_fsnotify_workfn(struct work_struct *work) +{ + struct trace_array *tr = container_of(work, struct trace_array, + fsnotify_work); + fsnotify(tr->d_max_latency->d_inode, FS_MODIFY, + tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0); +} + +static void latency_fsnotify_workfn_irq(struct irq_work *iwork) +{ + struct trace_array *tr = container_of(iwork, struct trace_array, + fsnotify_irqwork); + queue_work(fsnotify_wq, &tr->fsnotify_work); +} + +static void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) +{ + INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); + init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); + tr->d_max_latency = trace_create_file("tracing_max_latency", 0644, + d_tracer, &tr->max_latency, + &tracing_max_lat_fops); +} + +__init static int latency_fsnotify_init(void) +{ + fsnotify_wq = alloc_workqueue("tr_max_lat_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!fsnotify_wq) { + pr_err("Unable to allocate tr_max_lat_wq\n"); + return -ENOMEM; + } + return 0; +} + +late_initcall_sync(latency_fsnotify_init); + +void latency_fsnotify(struct trace_array *tr) +{ + if (!fsnotify_wq) + return; + /* + * We cannot call queue_work(&tr->fsnotify_work) from here because it's + * possible that we are called from __schedule() or do_idle(), which + * could cause a deadlock. 
+ */ + irq_work_queue(&tr->fsnotify_irqwork); +} + +/* + * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + * defined(CONFIG_FSNOTIFY) + */ +#else + +#define trace_create_maxlat_file(tr, d_tracer) \ + trace_create_file("tracing_max_latency", 0644, d_tracer, \ + &tr->max_latency, &tracing_max_lat_fops) + +#endif #ifdef CONFIG_TRACER_MAX_TRACE /* @@ -1536,6 +1607,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) /* record this tasks comm */ tracing_record_cmdline(tsk); + latency_fsnotify(tr); } /** @@ -8594,8 +8666,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) - trace_create_file("tracing_max_latency", 0644, d_tracer, - &tr->max_latency, &tracing_max_lat_fops); + trace_create_maxlat_file(tr, d_tracer); #endif if (ftrace_create_function_files(tr, d_tracer)) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8b590f10bc72..718eb998c13e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -16,6 +16,8 @@ #include #include #include +#include +#include #ifdef CONFIG_FTRACE_SYSCALLS #include /* For NR_SYSCALLS */ @@ -264,6 +266,11 @@ struct trace_array { #endif #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) unsigned long max_latency; +#ifdef CONFIG_FSNOTIFY + struct dentry *d_max_latency; + struct work_struct fsnotify_work; + struct irq_work fsnotify_irqwork; +#endif #endif struct trace_pid_list __rcu *filtered_pids; /* @@ -786,6 +793,17 @@ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); #endif /* CONFIG_TRACER_MAX_TRACE */ +#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + defined(CONFIG_FSNOTIFY) + +void latency_fsnotify(struct trace_array *tr); + +#else + +static void latency_fsnotify(struct trace_array *tr) { } + +#endif + #ifdef CONFIG_STACKTRACE void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 862f4b0139fc..63526670605a 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -237,6 +237,7 @@ static int get_sample(void) /* If we exceed the threshold value, we have found a hardware latency */ if (sample > thresh || outer_sample > thresh) { struct hwlat_sample s; + u64 latency; ret = 1; @@ -253,11 +254,13 @@ static int get_sample(void) s.nmi_count = nmi_count; trace_hwlat_sample(&s); + latency = max(sample, outer_sample); + /* Keep a running maximum ever recorded hardware latency */ - if (sample > tr->max_latency) - tr->max_latency = sample; - if (outer_sample > tr->max_latency) - tr->max_latency = outer_sample; + if (latency > tr->max_latency) { + tr->max_latency = latency; + latency_fsnotify(tr); + } } out: -- cgit v1.2.3 From 793937236d1ee032d2ee5ccc27bdd280a04e766e Mon Sep 17 00:00:00 2001 From: "Viktor Rosendahl (BMW)" Date: Wed, 9 Oct 2019 00:08:22 +0200 Subject: preemptirq_delay_test: Add the burst feature and a sysfs trigger This burst feature enables the user to generate a burst of preempt/irqsoff latencies. This makes it possible to test whether we are able to detect latencies that systematically occur very close to each other. The maximum burst size is 10. We also create 10 identical test functions, so that we get 10 different backtraces; this is useful when we want to test whether we can detect all the latencies in a burst. 
Otherwise, there would be no easy way of differentiating between which latency in a burst was captured by the tracer. In addition, there is a sysfs trigger, so that it's not necessary to reload the module to repeat the test. The trigger will appear as /sys/kernel/preemptirq_delay_test/trigger in sysfs. Link: http://lkml.kernel.org/r/20191008220824.7911-3-viktor.rosendahl@gmail.com Reviewed-by: Joel Fernandes (Google) Signed-off-by: Viktor Rosendahl (BMW) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 6 +- kernel/trace/preemptirq_delay_test.c | 144 ++++++++++++++++++++++++++++++----- 2 files changed, 128 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 624a05e99b0b..d25314bc7a1c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -760,9 +760,9 @@ config PREEMPTIRQ_DELAY_TEST configurable delay. The module busy waits for the duration of the critical section. - For example, the following invocation forces a one-time irq-disabled - critical section for 500us: - modprobe preemptirq_delay_test test_mode=irq delay=500000 + For example, the following invocation generates a burst of three + irq-disabled critical sections for 500us: + modprobe preemptirq_delay_test test_mode=irq delay=500 burst_size=3 If unsure, say N diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index d8765c952fab..31c0fad4cb9e 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -10,18 +10,25 @@ #include #include #include +#include #include #include #include #include +#include static ulong delay = 100; -static char test_mode[10] = "irq"; +static char test_mode[12] = "irq"; +static uint burst_size = 1; -module_param_named(delay, delay, ulong, S_IRUGO); -module_param_string(test_mode, test_mode, 10, S_IRUGO); -MODULE_PARM_DESC(delay, "Period in microseconds (100 uS default)"); -MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt or irq (default irq)"); +module_param_named(delay, delay, ulong, 0444); +module_param_string(test_mode, test_mode, 12, 0444); +module_param_named(burst_size, burst_size, uint, 0444); +MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)"); +MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)"); +MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)"); + +#define MIN(x, y) ((x) < (y) ? 
(x) : (y)) static void busy_wait(ulong time) { @@ -34,37 +41,136 @@ static void busy_wait(ulong time) } while ((end - start) < (time * 1000)); } -static int preemptirq_delay_run(void *data) +static __always_inline void irqoff_test(void) { unsigned long flags; + local_irq_save(flags); + busy_wait(delay); + local_irq_restore(flags); +} - if (!strcmp(test_mode, "irq")) { - local_irq_save(flags); - busy_wait(delay); - local_irq_restore(flags); - } else if (!strcmp(test_mode, "preempt")) { - preempt_disable(); - busy_wait(delay); - preempt_enable(); +static __always_inline void preemptoff_test(void) +{ + preempt_disable(); + busy_wait(delay); + preempt_enable(); +} + +static void execute_preemptirqtest(int idx) +{ + if (!strcmp(test_mode, "irq")) + irqoff_test(); + else if (!strcmp(test_mode, "preempt")) + preemptoff_test(); + else if (!strcmp(test_mode, "alternate")) { + if (idx % 2 == 0) + irqoff_test(); + else + preemptoff_test(); } +} + +#define DECLARE_TESTFN(POSTFIX) \ + static void preemptirqtest_##POSTFIX(int idx) \ + { \ + execute_preemptirqtest(idx); \ + } \ +/* + * We create 10 different functions, so that we can get 10 different + * backtraces. + */ +DECLARE_TESTFN(0) +DECLARE_TESTFN(1) +DECLARE_TESTFN(2) +DECLARE_TESTFN(3) +DECLARE_TESTFN(4) +DECLARE_TESTFN(5) +DECLARE_TESTFN(6) +DECLARE_TESTFN(7) +DECLARE_TESTFN(8) +DECLARE_TESTFN(9) + +static void (*testfuncs[])(int) = { + preemptirqtest_0, + preemptirqtest_1, + preemptirqtest_2, + preemptirqtest_3, + preemptirqtest_4, + preemptirqtest_5, + preemptirqtest_6, + preemptirqtest_7, + preemptirqtest_8, + preemptirqtest_9, +}; + +#define NR_TEST_FUNCS ARRAY_SIZE(testfuncs) + +static int preemptirq_delay_run(void *data) +{ + int i; + int s = MIN(burst_size, NR_TEST_FUNCS); + + for (i = 0; i < s; i++) + (testfuncs[i])(i); return 0; } -static int __init preemptirq_delay_init(void) +static struct task_struct *preemptirq_start_test(void) { char task_name[50]; - struct task_struct *test_task; snprintf(task_name, sizeof(task_name), "%s_test", test_mode); + return kthread_run(preemptirq_delay_run, NULL, task_name); +} + + +static ssize_t trigger_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + preemptirq_start_test(); + return count; +} + +static struct kobj_attribute trigger_attribute = + __ATTR(trigger, 0200, NULL, trigger_store); + +static struct attribute *attrs[] = { + &trigger_attribute.attr, + NULL, +}; + +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +static struct kobject *preemptirq_delay_kobj; + +static int __init preemptirq_delay_init(void) +{ + struct task_struct *test_task; + int retval; + + test_task = preemptirq_start_test(); + retval = PTR_ERR_OR_ZERO(test_task); + if (retval != 0) + return retval; + + preemptirq_delay_kobj = kobject_create_and_add("preemptirq_delay_test", + kernel_kobj); + if (!preemptirq_delay_kobj) + return -ENOMEM; + + retval = sysfs_create_group(preemptirq_delay_kobj, &attr_group); + if (retval) + kobject_put(preemptirq_delay_kobj); - test_task = kthread_run(preemptirq_delay_run, NULL, task_name); - return PTR_ERR_OR_ZERO(test_task); + return retval; } static void __exit preemptirq_delay_exit(void) { - return; + kobject_put(preemptirq_delay_kobj); } module_init(preemptirq_delay_init) -- cgit v1.2.3 From 9c34fc4b7e903117cc27712b9e6c8690debb7e95 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:20 +0200 Subject: tracing: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by 
CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Add additional header output for PREEMPT_RT. Link: http://lkml.kernel.org/r/20191015191821.11479-34-bigeasy@linutronix.de Cc: Ingo Molnar Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f093a433cb42..db7d06a26861 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3726,6 +3726,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) "desktop", #elif defined(CONFIG_PREEMPT) "preempt", +#elif defined(CONFIG_PREEMPT_RT) + "preempt_rt", #else "unknown", #endif -- cgit v1.2.3 From 6dff4d7dd3e0158688683a17dd792861aa9d61e2 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 15 Oct 2019 13:10:12 +0100 Subject: tracing: Make internal ftrace events static The event_class_ftrace_##call and event_##call do not seem to be used outside of trace_export.c so make them both static to avoid a number of sparse warnings: kernel/trace/trace_entries.h:59:1: warning: symbol 'event_class_ftrace_function' was not declared. Should it be static? kernel/trace/trace_entries.h:59:1: warning: symbol '__event_function' was not declared. Should it be static? kernel/trace/trace_entries.h:77:1: warning: symbol 'event_class_ftrace_funcgraph_entry' was not declared. Should it be static? kernel/trace/trace_entries.h:77:1: warning: symbol '__event_funcgraph_entry' was not declared. Should it be static? kernel/trace/trace_entries.h:93:1: warning: symbol 'event_class_ftrace_funcgraph_exit' was not declared. Should it be static? kernel/trace/trace_entries.h:93:1: warning: symbol '__event_funcgraph_exit' was not declared. Should it be static? kernel/trace/trace_entries.h:129:1: warning: symbol 'event_class_ftrace_context_switch' was not declared. Should it be static? kernel/trace/trace_entries.h:129:1: warning: symbol '__event_context_switch' was not declared. Should it be static? kernel/trace/trace_entries.h:149:1: warning: symbol 'event_class_ftrace_wakeup' was not declared. Should it be static? kernel/trace/trace_entries.h:149:1: warning: symbol '__event_wakeup' was not declared. Should it be static? kernel/trace/trace_entries.h:171:1: warning: symbol 'event_class_ftrace_kernel_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:171:1: warning: symbol '__event_kernel_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:191:1: warning: symbol 'event_class_ftrace_user_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:191:1: warning: symbol '__event_user_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:214:1: warning: symbol 'event_class_ftrace_bprint' was not declared. Should it be static? kernel/trace/trace_entries.h:214:1: warning: symbol '__event_bprint' was not declared. Should it be static? kernel/trace/trace_entries.h:230:1: warning: symbol 'event_class_ftrace_print' was not declared. Should it be static? kernel/trace/trace_entries.h:230:1: warning: symbol '__event_print' was not declared. Should it be static? kernel/trace/trace_entries.h:247:1: warning: symbol 'event_class_ftrace_raw_data' was not declared. Should it be static? kernel/trace/trace_entries.h:247:1: warning: symbol '__event_raw_data' was not declared. Should it be static? 
kernel/trace/trace_entries.h:262:1: warning: symbol 'event_class_ftrace_bputs' was not declared. Should it be static? kernel/trace/trace_entries.h:262:1: warning: symbol '__event_bputs' was not declared. Should it be static? kernel/trace/trace_entries.h:277:1: warning: symbol 'event_class_ftrace_mmiotrace_rw' was not declared. Should it be static? kernel/trace/trace_entries.h:277:1: warning: symbol '__event_mmiotrace_rw' was not declared. Should it be static? kernel/trace/trace_entries.h:298:1: warning: symbol 'event_class_ftrace_mmiotrace_map' was not declared. Should it be static? kernel/trace/trace_entries.h:298:1: warning: symbol '__event_mmiotrace_map' was not declared. Should it be static? kernel/trace/trace_entries.h:322:1: warning: symbol 'event_class_ftrace_branch' was not declared. Should it be static? kernel/trace/trace_entries.h:322:1: warning: symbol '__event_branch' was not declared. Should it be static? kernel/trace/trace_entries.h:343:1: warning: symbol 'event_class_ftrace_hwlat' was not declared. Should it be static? kernel/trace/trace_entries.h:343:1: warning: symbol '__event_hwlat' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20191015121012.18824-1-ben.dooks@codethink.co.uk Signed-off-by: Ben Dooks Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_export.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 45630a76ed3a..2e6d2e9741cc 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -171,7 +171,7 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \ #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ regfn) \ \ -struct trace_event_class __refdata event_class_ftrace_##call = { \ +static struct trace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ @@ -187,7 +187,7 @@ struct trace_event_call __used event_##call = { \ .print_fmt = print, \ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ -struct trace_event_call __used \ +static struct trace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #undef FTRACE_ENTRY -- cgit v1.2.3 From 2d6425af61166e026e7476db64f70f1266127b1d Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:23 -0700 Subject: tracing: Declare newly exported APIs in include/linux/trace.h Declare the newly introduced and exported APIs in the header file - include/linux/trace.h. Moving previous declarations from kernel/trace/trace.h to include/linux/trace.h. 
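A hedged sketch of the header side (exact kernel-doc omitted; the two declarations shown are the ones the diff below removes from kernel/trace/trace.h):

	/* include/linux/trace.h */
	int trace_array_printk(struct trace_array *tr, unsigned long ip,
			       const char *fmt, ...);
	void trace_printk_init_buffers(void);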
Link: http://lkml.kernel.org/r/1565805327-579-2-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 718eb998c13e..90cba68c8b50 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -873,8 +874,6 @@ trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); -int trace_array_printk(struct trace_array *tr, - unsigned long ip, const char *fmt, ...); int trace_array_printk_buf(struct ring_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); @@ -1890,7 +1889,6 @@ extern const char *__start___tracepoint_str[]; extern const char *__stop___tracepoint_str[]; void trace_printk_control(bool enabled); -void trace_printk_init_buffers(void); void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); -- cgit v1.2.3 From e585e6469d6f476b82aa148dc44aaf7ae269a4e2 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:24 -0700 Subject: tracing: Verify if trace array exists before destroying it. A trace array can be destroyed from userspace or kernel. Verify if the trace array exists before proceeding to destroy/remove it. Link: http://lkml.kernel.org/r/1565805327-579-3-git-send-email-divya.indi@oracle.com Reviewed-by: Aruna Ramakrishna Signed-off-by: Divya Indi [ Removed unneeded braces ] Signed-off-by: Steven Rostedt (VMware) --- kernel/module.c | 6 +++++- kernel/trace/trace.c | 15 ++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ff2d7359a418..6e2fd40a6ed9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3728,7 +3728,6 @@ static int complete_formation(struct module *mod, struct load_info *info) module_enable_ro(mod, false); module_enable_nx(mod); - module_enable_x(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. */ @@ -3751,6 +3750,11 @@ static int prepare_coming_module(struct module *mod) if (err) return err; + /* Make module executable after ftrace is enabled */ + mutex_lock(&module_mutex); + module_enable_x(mod); + mutex_unlock(&module_mutex); + blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index db7d06a26861..fa4f742fc449 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8556,17 +8556,26 @@ static int __remove_instance(struct trace_array *tr) return 0; } -int trace_array_destroy(struct trace_array *tr) +int trace_array_destroy(struct trace_array *this_tr) { + struct trace_array *tr; int ret; - if (!tr) + if (!this_tr) return -EINVAL; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); - ret = __remove_instance(tr); + ret = -ENODEV; + + /* Making sure trace array exists before destroying it. 
*/ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == this_tr) { + ret = __remove_instance(tr); + break; + } + } mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); -- cgit v1.2.3 From 953ae45a0c25e09428d4a03d7654f97ab8a36647 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:25 -0700 Subject: tracing: Adding NULL checks for trace_array descriptor pointer As part of commit f45d1225adb0 ("tracing: Kernel access to Ftrace instances") we exported certain functions. Here, we are adding some additional NULL checks to ensure safe usage by users of these APIs. Link: http://lkml.kernel.org/r/1565805327-579-4-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 +++ kernel/trace/trace_events.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fa4f742fc449..79fe4d6ecbd8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3297,6 +3297,9 @@ int trace_array_printk(struct trace_array *tr, if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; + if (!tr) + return -ENOENT; + va_start(ap, fmt); ret = trace_array_vprintk(tr, ip, fmt, ap); va_end(ap); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fba87d10f0c1..2a3ac2365445 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -793,6 +793,8 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) char *event = NULL, *sub = NULL, *match; int ret; + if (!tr) + return -ENOENT; /* * The buf format can be : * *: means any event by that name. -- cgit v1.2.3 From b83b43ffc6e4b514ca034a0fbdee01322e2f7022 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 15 Oct 2019 09:00:55 -0400 Subject: fgraph: Fix function type mismatches of ftrace_graph_return using ftrace_stub The C compiler is allowing more checks to make sure that function pointers are assigned to the correct prototype function. Unfortunately, the function graph tracer uses a special name with its assigned ftrace_graph_return function pointer that maps to a stub function used by the function tracer (ftrace_stub). The ftrace_graph_return variable is compared to the ftrace_stub in some archs to know if the function graph tracer is enabled or not. This means we can not just simply create a new function stub that compares it without modifying all the archs. Instead, have the linker script create a function_graph_stub that maps to ftrace_stub, and this way we can define the prototype for it to match the prototype of ftrace_graph_return, and make the compiler checks all happy! Link: http://lkml.kernel.org/r/20191015090055.789a0aed@gandalf.local.home Cc: linux-sh@vger.kernel.org Cc: Yoshinori Sato Cc: Rich Felker Reported-by: Sami Tolvanen Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 7950a0356042..fa3ce10d0405 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -332,9 +332,14 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) return 0; } +/* + * Simply points to ftrace_stub, but with the proper protocol. 
+ * Defined by the linker script in linux/vmlinux.lds.h + */ +extern void ftrace_graph_stub(struct ftrace_graph_ret *); + /* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = - (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_stub; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; @@ -614,7 +619,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) goto out; ftrace_graph_active--; - ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_return = ftrace_graph_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); -- cgit v1.2.3 From 80042c8f06bf5a7b87a63deaa3deb56f2cd52645 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Oct 2019 16:56:56 +0300 Subject: tracing: Use generic type for comparator function Comparator function type, cmp_func_t, is defined in the types.h, use it in the code. Link: http://lkml.kernel.org/r/20191007135656.37734-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 12 ++++++------ kernel/trace/trace_branch.c | 8 ++++---- kernel/trace/trace_stat.c | 6 ++---- kernel/trace/trace_stat.h | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d2d488c43a6a..82ef8d60a42b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -465,10 +465,10 @@ static void *function_stat_start(struct tracer_stat *trace) #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* function graph compares on total time */ -static int function_stat_cmp(void *p1, void *p2) +static int function_stat_cmp(const void *p1, const void *p2) { - struct ftrace_profile *a = p1; - struct ftrace_profile *b = p2; + const struct ftrace_profile *a = p1; + const struct ftrace_profile *b = p2; if (a->time < b->time) return -1; @@ -479,10 +479,10 @@ static int function_stat_cmp(void *p1, void *p2) } #else /* not function graph compares against hits */ -static int function_stat_cmp(void *p1, void *p2) +static int function_stat_cmp(const void *p1, const void *p2) { - struct ftrace_profile *a = p1; - struct ftrace_profile *b = p2; + const struct ftrace_profile *a = p1; + const struct ftrace_profile *b = p2; if (a->counter < b->counter) return -1; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 3ea65cdff30d..88e158d27965 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -244,7 +244,7 @@ static int annotated_branch_stat_headers(struct seq_file *m) return 0; } -static inline long get_incorrect_percent(struct ftrace_branch_data *p) +static inline long get_incorrect_percent(const struct ftrace_branch_data *p) { long percent; @@ -332,10 +332,10 @@ annotated_branch_stat_next(void *v, int idx) return p; } -static int annotated_branch_stat_cmp(void *p1, void *p2) +static int annotated_branch_stat_cmp(const void *p1, const void *p2) { - struct ftrace_branch_data *a = p1; - struct ftrace_branch_data *b = p2; + const struct ftrace_branch_data *a = p1; + const struct ftrace_branch_data *b = p2; long percent_a, percent_b; diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 9ab0a1a7ad5e..874f1274cf99 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -72,9 +72,7 @@ static 
void destroy_session(struct stat_session *session) kfree(session); } -typedef int (*cmp_stat_t)(void *, void *); - -static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) +static int insert_stat(struct rb_root *root, void *stat, cmp_func_t cmp) { struct rb_node **new = &(root->rb_node), *parent = NULL; struct stat_node *data; @@ -112,7 +110,7 @@ static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) * This one will force an insertion as right-most node * in the rbtree. */ -static int dummy_cmp(void *p1, void *p2) +static int dummy_cmp(const void *p1, const void *p2) { return -1; } diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 8786d17caf49..31d7dc5bf1db 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -16,7 +16,7 @@ struct tracer_stat { void *(*stat_start)(struct tracer_stat *trace); void *(*stat_next)(void *prev, int idx); /* Compare two entries for stats sorting */ - int (*stat_cmp)(void *p1, void *p2); + cmp_func_t stat_cmp; /* Print a stat entry */ int (*stat_show)(struct seq_file *s, void *p); /* Release an entry */ -- cgit v1.2.3 From 0c3c86bdc691c794a6154f8515b7fa82c82dfc4d Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat (VMware)" Date: Thu, 10 Oct 2019 11:51:17 -0700 Subject: tracing/hwlat: Fix a few trivial nits Update the source file name in the comments, and fix a grammatical error. Link: http://lkml.kernel.org/r/157073346821.17189.8946944856026592247.stgit@srivatsa-ubuntu Signed-off-by: Srivatsa S. Bhat (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 63526670605a..6638d63f0921 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * trace_hwlatdetect.c - A simple Hardware Latency detector. + * trace_hwlat.c - A simple Hardware Latency detector. * * Use this tracer to detect large system latencies induced by the behavior of * certain underlying system hardware or firmware, independent of Linux itself. @@ -279,7 +279,7 @@ static void move_to_next_cpu(void) return; /* * If for some reason the user modifies the CPU affinity - * of this thread, than stop migrating for the duration + * of this thread, then stop migrating for the duration * of the current test. */ if (!cpumask_equal(current_mask, current->cpus_ptr)) -- cgit v1.2.3 From 6ee40511cb838f9ced002dff7131bca87e3ccbdd Mon Sep 17 00:00:00 2001 From: Yuming Han Date: Thu, 24 Oct 2019 11:34:30 +0800 Subject: tracing: use kvcalloc for tgid_map array allocation Allocation of memory for tgid_map can fail, because it requires an order-6 page. Details as follows: c3 sh: page allocation failure: order:6, mode:0x140c0c0(GFP_KERNEL), nodemask=(null) c3 sh cpuset=/ mems_allowed=0 c3 CPU: 3 PID: 5632 Comm: sh Tainted: G W O 4.14.133+ #10 c3 Hardware name: Generic DT based system c3 Backtrace: c3 [] (dump_backtrace) from [](show_stack+0x18/0x1c) c3 [] (show_stack) from [](dump_stack+0x84/0xa4) c3 [] (dump_stack) from [](warn_alloc+0xc4/0x19c) c3 [] (warn_alloc) from [](__alloc_pages_nodemask+0xd18/0xf28) c3 [] (__alloc_pages_nodemask) from [](kmalloc_order+0x20/0x38) c3 [] (kmalloc_order) from [](kmalloc_order_trace+0x24/0x108) c3 [] (kmalloc_order_trace) from [](set_tracer_flag+0xb0/0x158) c3 [] (set_tracer_flag) from [](trace_options_core_write+0x7c/0xcc) c3 [] (trace_options_core_write) from [](__vfs_write+0x40/0x14c) c3 [] (__vfs_write) from [](vfs_write+0xc4/0x198) c3 [] (vfs_write) from [](SyS_write+0x6c/0xd0) c3 [] (SyS_write) from [](ret_fast_syscall+0x0/0x54) Switch to use kvcalloc to avoid unexpected allocation failures. Link: http://lkml.kernel.org/r/1571888070-24425-1-git-send-email-chunyan.zhang@unisoc.com Signed-off-by: Yuming Han Signed-off-by: Chunyan Zhang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 79fe4d6ecbd8..42659ce6ac0c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4686,7 +4686,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_TGID) { if (!tgid_map) - tgid_map = kcalloc(PID_MAX_DEFAULT + 1, + tgid_map = kvcalloc(PID_MAX_DEFAULT + 1, sizeof(*tgid_map), GFP_KERNEL); if (!tgid_map) { -- cgit v1.2.3 From c7411a1a126f649be71526a36d4afac9e5aefa13 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 29 Oct 2019 17:31:44 +0900 Subject: tracing/kprobe: Check whether the non-suffixed symbol is notrace Check whether the non-suffixed symbol is notrace, since suffixed symbols are generated by the compilers for optimization. The notrace check might not work on these suffixed symbols, because some of them are just partial code of the original function (e.g. cold-cache (unlikely) code is separated from the original function as FUNCTION.cold.XX). For example, without this fix, # echo p device_add.cold.67 > /sys/kernel/debug/tracing/kprobe_events sh: write error: Invalid argument # cat /sys/kernel/debug/tracing/error_log [ 135.491035] trace_kprobe: error: Failed to register probe event Command: p device_add.cold.67 ^ # dmesg | tail -n 1 [ 135.488599] trace_kprobe: Could not probe notrace function device_add.cold.67 With this, # echo p device_add.cold.66 > /sys/kernel/debug/tracing/kprobe_events # cat /sys/kernel/debug/kprobes/list ffffffff81599de9 k device_add.cold.66+0x0 [DISABLED] Actually, the kprobe blacklist already does a similar thing; see within_kprobe_blacklist().
Link: http://lkml.kernel.org/r/157233790394.6706.18243942030937189679.stgit@devnote2 Fixes: 45408c4f9250 ("tracing: kprobes: Prohibit probing on notrace function") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1552a95c743b..7f890262c8a3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -435,11 +435,10 @@ static int disable_trace_kprobe(struct trace_event_call *call, #if defined(CONFIG_KPROBES_ON_FTRACE) && \ !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) -static bool within_notrace_func(struct trace_kprobe *tk) +static bool __within_notrace_func(unsigned long addr) { - unsigned long offset, size, addr; + unsigned long offset, size; - addr = trace_kprobe_address(tk); if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset)) return false; @@ -452,6 +451,28 @@ static bool within_notrace_func(struct trace_kprobe *tk) */ return !ftrace_location_range(addr, addr + size - 1); } + +static bool within_notrace_func(struct trace_kprobe *tk) +{ + unsigned long addr = trace_kprobe_address(tk); + char symname[KSYM_NAME_LEN], *p; + + if (!__within_notrace_func(addr)) + return false; + + /* Check if the address is on a suffixed-symbol */ + if (!lookup_symbol_name(addr, symname)) { + p = strchr(symname, '.'); + if (!p) + return true; + *p = '\0'; + addr = (unsigned long)kprobe_lookup_name(symname, 0); + if (addr) + return __within_notrace_func(addr); + } + + return true; +} #else #define within_notrace_func(tk) (false) #endif -- cgit v1.2.3 From ef56e047b2bd4dabb801fd073dfcab5f40de5f78 Mon Sep 17 00:00:00 2001 From: Piotr Maziarz Date: Thu, 7 Nov 2019 13:45:38 +0100 Subject: tracing: Use seq_buf_hex_dump() to dump buffers Without this, buffers can be printed with the __print_array macro, which has no formatting options and can be hard to read. The alternative is to mimic the formatting with multiple trace event calls, one per row, which hurts performance and gives each row a different timestamp.
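As a usage sketch (hedged: the tracepoint-side wrapper, here called __print_hex_dump, is defined outside kernel/ and therefore does not appear in this diff), an event could format a binary payload in classic hexdump style:

	TP_printk("buf:%s",
		  __print_hex_dump("", DUMP_PREFIX_OFFSET, 16, 1,
				   __get_dynamic_array(buf),
				   __get_dynamic_array_len(buf), false))

This renders 16 bytes per row with offsets instead of __print_array's flat list of values, and costs a single trace event with a single timestamp.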
Link: http://lkml.kernel.org/r/1573130738-29390-2-git-send-email-piotrx.maziarz@linux.intel.com Signed-off-by: Piotr Maziarz Signed-off-by: Cezary Rojewski Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 15 +++++++++++++++ kernel/trace/trace_seq.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d54ce252b05a..d9b4b7c22db4 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -274,6 +274,21 @@ trace_print_array_seq(struct trace_seq *p, const void *buf, int count, } EXPORT_SYMBOL(trace_print_array_seq); +const char * +trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_putc(p, '\n'); + trace_seq_hex_dump(p, prefix_str, prefix_type, + rowsize, groupsize, buf, len, ascii); + trace_seq_putc(p, 0); + return ret; +} +EXPORT_SYMBOL(trace_print_hex_dump_seq); + int trace_raw_output_prep(struct trace_iterator *iter, struct trace_event *trace_event) { diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 6b1c562ffdaf..344e4c1aa09c 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -376,3 +376,33 @@ int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) return seq_buf_to_user(&s->seq, ubuf, cnt); } EXPORT_SYMBOL_GPL(trace_seq_to_user); + +int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return 0; + + __trace_seq_init(s); + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return 0; + } + + seq_buf_hex_dump(&(s->seq), prefix_str, + prefix_type, rowsize, groupsize, + buf, len, ascii); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return 0; + } + + return 1; +} +EXPORT_SYMBOL(trace_seq_hex_dump); -- cgit v1.2.3 From 9b4712044d059e7842aaeeafd7c7a7ee88c589db Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 12 Nov 2019 18:42:19 +0100 Subject: tracing: Remove stray tab in TRACE_EVAL_MAP_FILE's help text There was a stray tab in the help text of the aforementioned config option which showed like this: The "print fmt" of the trace events will show the enum/sizeof names instead of their values. This can cause problems for user space tools ... in menuconfig. Remove it and end a sentence with a fullstop. No functional changes. Link: http://lkml.kernel.org/r/20191112174219.10933-1-bp@alien8.de Signed-off-by: Borislav Petkov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d25314bc7a1c..b872716bb2a0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -771,7 +771,7 @@ config TRACE_EVAL_MAP_FILE depends on TRACING help The "print fmt" of the trace events will show the enum/sizeof names - instead of their values. This can cause problems for user space tools + instead of their values. This can cause problems for user space tools that use this string to parse the raw data as user space does not know how to convert the string to its value. @@ -792,7 +792,7 @@ config TRACE_EVAL_MAP_FILE they are needed for the "eval_map" file. 
Enabling this option will increase the memory footprint of the running kernel. - If unsure, say N + If unsure, say N. config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" -- cgit v1.2.3 From 36b3615dc3b625c8b587f34e413a600f7ac16403 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Nov 2019 22:43:58 -0500 Subject: tracing: Add missing "inline" in stub function of latency_fsnotify() The latency_fsnotify() stub, used when the function is not defined, was missing the "inline". Link: https://lore.kernel.org/r/20191115140213.74c5efe7@canb.auug.org.au Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 90cba68c8b50..2df8aed6a8f0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -801,7 +801,7 @@ void latency_fsnotify(struct trace_array *tr); #else -static void latency_fsnotify(struct trace_array *tr) { } +static inline void latency_fsnotify(struct trace_array *tr) { } #endif -- cgit v1.2.3 From 0567d6809182df53da03636fad36c507c5cf07a5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Nov 2019 14:39:35 -0500 Subject: ftrace: Add modify_ftrace_direct() Add a new function modify_ftrace_direct() that will allow a user to update an existing direct caller to a new trampoline, without missing hits due to unregistering one and then adding another. Link: https://lore.kernel.org/r/20191109022907.6zzo6orhxpt5n2sv@ast-mbp.dhcp.thefacebook.com Suggested-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 82ef8d60a42b..834f3556ea1e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5160,6 +5160,84 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) return ret; } EXPORT_SYMBOL_GPL(unregister_ftrace_direct); + +static struct ftrace_ops stub_ops = { + .func = ftrace_stub, +}; + +/** + * modify_ftrace_direct - Modify an existing direct call to call something else + * @ip: The instruction pointer to modify + * @old_addr: The address that the current @ip calls directly + * @new_addr: The address that the @ip should call + * + * This modifies a ftrace direct caller at an instruction pointer without + * having to disable it first. The direct call will switch over to the + * @new_addr without missing anything. + * + * Returns: zero on success.
Non zero on error, which includes: + * -ENODEV : the @ip given has no direct caller attached + * -EINVAL : the @old_addr does not match the current direct caller + */ +int modify_ftrace_direct(unsigned long ip, + unsigned long old_addr, unsigned long new_addr) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) { + /* OK if it is off by a little */ + rec = lookup_rec(ip, ip); + if (!rec || rec->ip == ip) + goto out_unlock; + + entry = __ftrace_lookup_ip(direct_functions, rec->ip); + if (!entry) + goto out_unlock; + + ip = rec->ip; + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); + } + + ret = -EINVAL; + if (entry->direct != old_addr) + goto out_unlock; + + /* + * By setting a stub function at the same address, we force + * the code to call the iterator and the direct_ops helper. + * This means that @ip does not call the direct call, and + * we can simply modify it. + */ + ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); + if (ret) + goto out_unlock; + + ret = register_ftrace_function(&stub_ops); + if (ret) { + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + goto out_unlock; + } + + entry->direct = new_addr; + + /* + * By removing the stub, we put back the direct call, calling + * the @new_addr. + */ + unregister_ftrace_function(&stub_ops); + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + + ret = 0; + + out_unlock: + mutex_unlock(&direct_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(modify_ftrace_direct); #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** -- cgit v1.2.3 From e9838bd51169af87ae248336d4c3fc59184a0e46 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 13 Nov 2019 18:12:01 +0100 Subject: irq_work: Fix IRQ_WORK_BUSY bit clearing While attempting to clear the busy bit at the end of a work execution, atomic_cmpxchg() expects the value of the flags with the pending bit cleared as the old value. However by mistake the value of the flags is passed without clearing the pending bit first. As a result, clearing the busy bit fails and irq_work_sync() may stall: watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [blktrace:4948] CPU: 0 PID: 4948 Comm: blktrace Not tainted 5.4.0-rc7-00003-gfeb4a51323bab #1 RIP: 0010:irq_work_sync+0x4/0x10 Call Trace: relay_close_buf+0x19/0x50 relay_close+0x64/0x100 blk_trace_free+0x1f/0x50 __blk_trace_remove+0x1e/0x30 blk_trace_ioctl+0x11b/0x140 blkdev_ioctl+0x6c1/0xa40 block_ioctl+0x39/0x40 do_vfs_ioctl+0xa5/0x700 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1d0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 So clear the appropriate bit before passing the old flags to cmpxchg(). Fixes: feb4a51323ba ("irq_work: Slightly simplify IRQ_WORK_PENDING clearing") Reported-by: kernel test robot Reported-by: Leonard Crestez Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Leonard Crestez Link: https://lkml.kernel.org/r/20191113171201.14032-1-frederic@kernel.org --- kernel/irq_work.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 49c53f80a13a..828cc30774bc 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -158,6 +158,7 @@ static void irq_work_run_list(struct llist_head *list) * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. 
*/ + flags &= ~IRQ_WORK_PENDING; (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); } } -- cgit v1.2.3 From 20a15ee040f23bd553d4e6bbb1f8724ccd282abc Mon Sep 17 00:00:00 2001 From: luanshi Date: Wed, 13 Nov 2019 22:41:33 +0800 Subject: genirq: Fix function documentation of __irq_alloc_descs() The function got renamed at some point, but the kernel-doc was not updated. Signed-off-by: Liguang Zhang Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1573656093-8643-1-git-send-email-zhangliguang@linux.alibaba.com --- kernel/irq/irqdesc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9be995fc3c5a..5b8fdd659e54 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -750,7 +750,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt) EXPORT_SYMBOL_GPL(irq_free_descs); /** - * irq_alloc_descs - allocate and initialize a range of irq descriptors + * __irq_alloc_descs - allocate and initialize a range of irq descriptors * @irq: Allocate for specific irq number if irq >= 0 * @from: Start the search from this irq number * @cnt: Number of consecutive irqs to allocate. -- cgit v1.2.3 From 3ca47e958a64b1116a2c35e65dcf467fc53d52de Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Apr 2019 17:43:50 +0200 Subject: y2038: remove CONFIG_64BIT_TIME The CONFIG_64BIT_TIME option is defined on all architectures, and can be removed for simplicity now. Signed-off-by: Arnd Bergmann --- kernel/time/hrtimer.c | 2 +- kernel/time/time.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 65605530ee34..9e20873148c6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1940,7 +1940,7 @@ out: return ret; } -#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +#ifdef CONFIG_64BIT SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) diff --git a/kernel/time/time.c b/kernel/time/time.c index 5c54ca632d08..96b8c02657ed 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -267,7 +267,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, } #endif -#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +#ifdef CONFIG_64BIT SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p) { struct __kernel_timex txc; /* Local copy of parameter */ @@ -881,7 +881,7 @@ int get_timespec64(struct timespec64 *ts, ts->tv_sec = kts.tv_sec; /* Zero out the padding for 32 bit systems or in compat mode */ - if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall()) + if (in_compat_syscall()) kts.tv_nsec &= 0xFFFFFFFFUL; ts->tv_nsec = kts.tv_nsec; -- cgit v1.2.3 From 2a785996cc5e2fc1d1d29d196f530905f68d2dc2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Nov 2019 11:10:01 +0100 Subject: y2038: uapi: change __kernel_time_t to __kernel_old_time_t This is mainly a patch for clarification, and to let us remove the time_t definition from the kernel to prevent new users from creeping in that might not be y2038-safe. All remaining uses of 'time_t' or '__kernel_time_t' are part of the user API that cannot be changed; they either have a replacement or do not suffer from the y2038 overflow.
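For context, the overflow being designed out here is simple to demonstrate (illustration only, not part of the patch):

	int32_t t = 0x7fffffff;	/* 2038-01-19 03:14:07 UTC on a 32-bit time_t */
	t += 1;			/* signed overflow: undefined behavior, and in
				 * practice a wrap to 1901-12-13 20:45:52 UTC */

A 64-bit time_t pushes the limit far beyond any hardware lifetime.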
Acked-by: Deepa Dinamani Acked-by: Christian Brauner Signed-off-by: Arnd Bergmann --- kernel/time/time.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 96b8c02657ed..833abae3364f 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -59,9 +59,9 @@ EXPORT_SYMBOL(sys_tz); * why not move it into the appropriate arch directory (for those * architectures that need it). */ -SYSCALL_DEFINE1(time, time_t __user *, tloc) +SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc) { - time_t i = (time_t)ktime_get_real_seconds(); + __kernel_old_time_t i = (__kernel_old_time_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) @@ -78,7 +78,7 @@ SYSCALL_DEFINE1(time, time_t __user *, tloc) * architectures that need it). */ -SYSCALL_DEFINE1(stime, time_t __user *, tptr) +SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr) { struct timespec64 tv; int err; -- cgit v1.2.3 From bdd565f817a74b9e30edec108f7cb1dbc762b8a6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 22:46:48 +0200 Subject: y2038: rusage: use __kernel_old_timeval There are two 'struct timeval' fields in 'struct rusage'. Unfortunately the definition of timeval is now ambiguous when used in user space with a libc that has a 64-bit time_t, and this also changes the 'rusage' definition in user space in a way that is incompatible with the system call interface. While there is no good solution to avoid all ambiguity here, change the definition in the kernel headers to be compatible with the kernel ABI, using __kernel_old_timeval as an unambiguous base type. In previous discussions, there was also a plan to add a replacement for rusage based on 64-bit timestamps and nanosecond resolution, i.e. 'struct __kernel_timespec'. I have patches for that as well, if anyone thinks we should do that. Reviewed-by: Cyrill Gorcunov Signed-off-by: Arnd Bergmann --- kernel/sys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index a611d1d58c7d..d3aef31e24dc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1763,8 +1763,8 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) unlock_task_sighand(p, &flags); out: - r->ru_utime = ns_to_timeval(utime); - r->ru_stime = ns_to_timeval(stime); + r->ru_utime = ns_to_kernel_old_timeval(utime); + r->ru_stime = ns_to_kernel_old_timeval(stime); if (who != RUSAGE_CHILDREN) { struct mm_struct *mm = get_task_mm(p); -- cgit v1.2.3 From 75d319c06e6a76f67549c0ae1007dc3167804f4e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 22:56:17 +0200 Subject: y2038: syscalls: change remaining timeval to __kernel_old_timeval All of the remaining syscalls that pass a timeval (gettimeofday, utime, futimesat) can trivially be changed to pass a __kernel_old_timeval instead, which has a compatible layout, but avoids ambiguity with the timeval type in user space. Acked-by: Christian Brauner Acked-by: Rafael J. 
Wysocki Signed-off-by: Arnd Bergmann --- kernel/power/power.h | 2 +- kernel/time/time.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 44bee462ff57..7cdc64dc2373 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -179,7 +179,7 @@ extern void swsusp_close(fmode_t); extern int swsusp_unmark(void); #endif -struct timeval; +struct __kernel_old_timeval; /* kernel/power/swsusp.c */ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); diff --git a/kernel/time/time.c b/kernel/time/time.c index 833abae3364f..a0e7b9909f2d 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -137,7 +137,7 @@ SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr) #endif /* __ARCH_WANT_SYS_TIME32 */ #endif -SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, +SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { if (likely(tv != NULL)) { -- cgit v1.2.3 From 5e0fb1b57bea8d11fe77da2bc80f4c9a67e28318 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Aug 2018 20:04:11 +0200 Subject: y2038: time: avoid timespec usage in settimeofday() The compat_get_timeval() and timeval_valid() interfaces are deprecated and getting removed along with the definition of struct timeval itself. Change the two implementations of the settimeofday() system call to open-code these helpers and completely avoid references to timeval. The timeval_valid() call is not needed any more here, only a check to avoid overflowing tv_nsec during the multiplication, as there is another range check in do_sys_settimeofday64(). Tested-by: syzbot+dccce9b26ba09ca49966@syzkaller.appspotmail.com Signed-off-by: Arnd Bergmann --- kernel/time/time.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index a0e7b9909f2d..58e312e7380f 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -196,22 +196,21 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz return 0; } -SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, +SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; - struct timeval user_tv; struct timezone new_tz; if (tv) { - if (copy_from_user(&user_tv, tv, sizeof(*tv))) + if (get_user(new_ts.tv_sec, &tv->tv_sec) || + get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; - if (!timeval_valid(&user_tv)) + if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) @@ -245,18 +244,17 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; - struct timeval user_tv; struct timezone new_tz; if (tv) { - if (compat_get_timeval(&user_tv, tv)) + if (get_user(new_ts.tv_sec, &tv->tv_sec) || + get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; - if (!timeval_valid(&user_tv)) + if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) -- cgit v1.2.3 From c1745f84be2657f5702388133551b759b9237f59 Mon Sep 17 
00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 10:46:22 +0200 Subject: y2038: itimer: compat handling to itimer.c The structure is only used in one place, moving it there simplifies the interface and helps with later changes to this code. Rename it to match the other time32 structures in the process. Reviewed-by: Thomas Gleixner Signed-off-by: Arnd Bergmann --- kernel/compat.c | 24 ------------------------ kernel/time/itimer.c | 42 +++++++++++++++++++++++++++++++++++------- 2 files changed, 35 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index a2bc1d6ceb57..95005f849c68 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -90,30 +90,6 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts) } EXPORT_SYMBOL_GPL(compat_put_timespec); -int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i) -{ - struct compat_itimerval v32; - - if (copy_from_user(&v32, i, sizeof(struct compat_itimerval))) - return -EFAULT; - o->it_interval.tv_sec = v32.it_interval.tv_sec; - o->it_interval.tv_usec = v32.it_interval.tv_usec; - o->it_value.tv_sec = v32.it_value.tv_sec; - o->it_value.tv_usec = v32.it_value.tv_usec; - return 0; -} - -int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerval *i) -{ - struct compat_itimerval v32; - - v32.it_interval.tv_sec = i->it_interval.tv_sec; - v32.it_interval.tv_usec = i->it_interval.tv_usec; - v32.it_value.tv_sec = i->it_value.tv_sec; - v32.it_value.tv_usec = i->it_value.tv_usec; - return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0; -} - #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 77f1e5635cc1..c52ebb40b60b 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -112,19 +112,34 @@ SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) } #ifdef CONFIG_COMPAT +struct old_itimerval32 { + struct old_timeval32 it_interval; + struct old_timeval32 it_value; +}; + +static int put_old_itimerval32(struct old_itimerval32 __user *o, const struct itimerval *i) +{ + struct old_itimerval32 v32; + + v32.it_interval.tv_sec = i->it_interval.tv_sec; + v32.it_interval.tv_usec = i->it_interval.tv_usec; + v32.it_value.tv_sec = i->it_value.tv_sec; + v32.it_value.tv_usec = i->it_value.tv_usec; + return copy_to_user(o, &v32, sizeof(struct old_itimerval32)) ? 
-EFAULT : 0; +} + COMPAT_SYSCALL_DEFINE2(getitimer, int, which, - struct compat_itimerval __user *, it) + struct old_itimerval32 __user *, it) { struct itimerval kit; int error = do_getitimer(which, &kit); - if (!error && put_compat_itimerval(it, &kit)) + if (!error && put_old_itimerval32(it, &kit)) error = -EFAULT; return error; } #endif - /* * The timer is automagically restarted, when interval != 0 */ @@ -310,15 +325,28 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, } #ifdef CONFIG_COMPAT +static int get_old_itimerval32(struct itimerval *o, const struct old_itimerval32 __user *i) +{ + struct old_itimerval32 v32; + + if (copy_from_user(&v32, i, sizeof(struct old_itimerval32))) + return -EFAULT; + o->it_interval.tv_sec = v32.it_interval.tv_sec; + o->it_interval.tv_usec = v32.it_interval.tv_usec; + o->it_value.tv_sec = v32.it_value.tv_sec; + o->it_value.tv_usec = v32.it_value.tv_usec; + return 0; +} + COMPAT_SYSCALL_DEFINE3(setitimer, int, which, - struct compat_itimerval __user *, in, - struct compat_itimerval __user *, out) + struct old_itimerval32 __user *, in, + struct old_itimerval32 __user *, out) { struct itimerval kin, kout; int error; if (in) { - if (get_compat_itimerval(&kin, in)) + if (get_old_itimerval32(&kin, in)) return -EFAULT; } else { memset(&kin, 0, sizeof(kin)); @@ -327,7 +355,7 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which, error = do_setitimer(which, &kin, out ? &kout : NULL); if (error || !out) return error; - if (put_compat_itimerval(out, &kout)) + if (put_old_itimerval32(out, &kout)) return -EFAULT; return 0; } -- cgit v1.2.3 From 4c22ea2b91203564fdf392b3d3cae249b652a8ae Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 16:59:39 +0200 Subject: y2038: use compat_{get,set}_itimer on alpha The itimer handling for the old alpha osf_setitimer/osf_getitimer system calls is identical to the compat version of getitimer/setitimer, so just use those directly. Signed-off-by: Arnd Bergmann --- kernel/time/itimer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index c52ebb40b60b..4664c6addf69 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -111,7 +111,7 @@ SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) return error; } -#ifdef CONFIG_COMPAT +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) struct old_itimerval32 { struct old_timeval32 it_interval; struct old_timeval32 it_value; @@ -324,7 +324,7 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, return 0; } -#ifdef CONFIG_COMPAT +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) static int get_old_itimerval32(struct itimerval *o, const struct old_itimerval32 __user *i) { struct old_itimerval32 v32; -- cgit v1.2.3 From ddbc7d0657e9fd38b69f16bd0310703367b52d29 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 21:37:43 +0200 Subject: y2038: move itimer reset into itimer.c Preparing for a change to the itimer internals, stop using the do_setitimer() symbol and instead use a new higher-level interface. The do_getitimer()/do_setitimer functions can now be made static, allowing the compiler to potentially produce better object code. 
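The one in-tree user is the SELinux bprm hook, which previously open-coded this loop; presumably it now reads along these lines (a sketch, not the verbatim hunk, which lives outside kernel/):

	/* security/selinux/hooks.c: reset inherited timers on a
	 * credential-changing exec */
	if (!fatal_signal_pending(current)) {
		flush_signal_handlers(current, 1);
		sigemptyset(&current->blocked);
		clear_itimer();
	}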
Reviewed-by: Thomas Gleixner Signed-off-by: Arnd Bergmann --- kernel/time/itimer.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 4664c6addf69..ce9cd19ce72e 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -73,7 +73,7 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, value->it_interval = ns_to_timeval(interval); } -int do_getitimer(int which, struct itimerval *value) +static int do_getitimer(int which, struct itimerval *value) { struct task_struct *tsk = current; @@ -197,7 +197,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, #define timeval_valid(t) \ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) -int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) +static int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; struct hrtimer *timer; @@ -249,6 +249,17 @@ again: return 0; } +#ifdef CONFIG_SECURITY_SELINUX +void clear_itimer(void) +{ + struct itimerval v = {}; + int i; + + for (i = 0; i < 3; i++) + do_setitimer(i, &v, NULL); +} +#endif + #ifdef __ARCH_WANT_SYS_ALARM /** -- cgit v1.2.3 From bd40a175769d411b2a37e1c087082ac7ee2c15bb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 7 Nov 2019 15:27:39 +0100 Subject: y2038: itimer: change implementation to timespec64 There is no 64-bit version of getitimer/setitimer since that is not actually needed. However, the implementation is built around the deprecated 'struct timeval' type. Change the code to use timespec64 internally to reduce the dependencies on timeval and associated helper functions. Minor adjustments in the code are needed to make the native and compat version work the same way, and to keep the range check working after the conversion. 
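The user-visible contract is unchanged by the internal timespec64 switch; classic usage (illustration only) still passes microseconds:

	struct itimerval it = {
		.it_value    = { .tv_sec = 1, .tv_usec = 0      }, /* first SIGALRM in 1s */
		.it_interval = { .tv_sec = 0, .tv_usec = 500000 }, /* then every 500ms */
	};
	setitimer(ITIMER_REAL, &it, NULL);

The kernel now converts tv_usec to tv_nsec once at copy-in and back at copy-out, keeping all internal arithmetic in a single representation.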
Signed-off-by: Arnd Bergmann --- kernel/time/itimer.c | 158 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 96 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index ce9cd19ce72e..5872db9bd5f7 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -26,7 +26,7 @@ * Returns the delta between the expiry time and now, which can be * less than zero or 1usec for an pending expired timer */ -static struct timeval itimer_get_remtime(struct hrtimer *timer) +static struct timespec64 itimer_get_remtime(struct hrtimer *timer) { ktime_t rem = __hrtimer_get_remaining(timer, true); @@ -41,11 +41,11 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) } else rem = 0; - return ktime_to_timeval(rem); + return ktime_to_timespec64(rem); } static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - struct itimerval *const value) + struct itimerspec64 *const value) { u64 val, interval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; @@ -69,11 +69,11 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, spin_unlock_irq(&tsk->sighand->siglock); - value->it_value = ns_to_timeval(val); - value->it_interval = ns_to_timeval(interval); + value->it_value = ns_to_timespec64(val); + value->it_interval = ns_to_timespec64(interval); } -static int do_getitimer(int which, struct itimerval *value) +static int do_getitimer(int which, struct itimerspec64 *value) { struct task_struct *tsk = current; @@ -82,7 +82,7 @@ static int do_getitimer(int which, struct itimerval *value) spin_lock_irq(&tsk->sighand->siglock); value->it_value = itimer_get_remtime(&tsk->signal->real_timer); value->it_interval = - ktime_to_timeval(tsk->signal->it_real_incr); + ktime_to_timespec64(tsk->signal->it_real_incr); spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: @@ -97,17 +97,26 @@ static int do_getitimer(int which, struct itimerval *value) return 0; } +static int put_itimerval(struct itimerval __user *o, + const struct itimerspec64 *i) +{ + struct itimerval v; + + v.it_interval.tv_sec = i->it_interval.tv_sec; + v.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC; + v.it_value.tv_sec = i->it_value.tv_sec; + v.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC; + return copy_to_user(o, &v, sizeof(struct itimerval)) ? -EFAULT : 0; +} + + SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) { - int error = -EFAULT; - struct itimerval get_buffer; + struct itimerspec64 get_buffer; + int error = do_getitimer(which, &get_buffer); - if (value) { - error = do_getitimer(which, &get_buffer); - if (!error && - copy_to_user(value, &get_buffer, sizeof(get_buffer))) - error = -EFAULT; - } + if (!error && put_itimerval(value, &get_buffer)) + error = -EFAULT; return error; } @@ -117,24 +126,25 @@ struct old_itimerval32 { struct old_timeval32 it_value; }; -static int put_old_itimerval32(struct old_itimerval32 __user *o, const struct itimerval *i) +static int put_old_itimerval32(struct old_itimerval32 __user *o, + const struct itimerspec64 *i) { struct old_itimerval32 v32; v32.it_interval.tv_sec = i->it_interval.tv_sec; - v32.it_interval.tv_usec = i->it_interval.tv_usec; + v32.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC; v32.it_value.tv_sec = i->it_value.tv_sec; - v32.it_value.tv_usec = i->it_value.tv_usec; + v32.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC; return copy_to_user(o, &v32, sizeof(struct old_itimerval32)) ? 
-EFAULT : 0; } COMPAT_SYSCALL_DEFINE2(getitimer, int, which, - struct old_itimerval32 __user *, it) + struct old_itimerval32 __user *, value) { - struct itimerval kit; - int error = do_getitimer(which, &kit); + struct itimerspec64 get_buffer; + int error = do_getitimer(which, &get_buffer); - if (!error && put_old_itimerval32(it, &kit)) + if (!error && put_old_itimerval32(value, &get_buffer)) error = -EFAULT; return error; } @@ -156,8 +166,8 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) } static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - const struct itimerval *const value, - struct itimerval *const ovalue) + const struct itimerspec64 *const value, + struct itimerspec64 *const ovalue) { u64 oval, nval, ointerval, ninterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; @@ -166,8 +176,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, * Use the to_ktime conversion because that clamps the maximum * value to KTIME_MAX and avoid multiplication overflows. */ - nval = ktime_to_ns(timeval_to_ktime(value->it_value)); - ninterval = ktime_to_ns(timeval_to_ktime(value->it_interval)); + nval = timespec64_to_ns(&value->it_value); + ninterval = timespec64_to_ns(&value->it_interval); spin_lock_irq(&tsk->sighand->siglock); @@ -186,8 +196,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, spin_unlock_irq(&tsk->sighand->siglock); if (ovalue) { - ovalue->it_value = ns_to_timeval(oval); - ovalue->it_interval = ns_to_timeval(ointerval); + ovalue->it_value = ns_to_timespec64(oval); + ovalue->it_interval = ns_to_timespec64(ointerval); } } @@ -197,19 +207,13 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, #define timeval_valid(t) \ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) -static int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) +static int do_setitimer(int which, struct itimerspec64 *value, + struct itimerspec64 *ovalue) { struct task_struct *tsk = current; struct hrtimer *timer; ktime_t expires; - /* - * Validate the timevals in value. 
- */ - if (!timeval_valid(&value->it_value) || - !timeval_valid(&value->it_interval)) - return -EINVAL; - switch (which) { case ITIMER_REAL: again: @@ -218,7 +222,7 @@ again: if (ovalue) { ovalue->it_value = itimer_get_remtime(timer); ovalue->it_interval - = ktime_to_timeval(tsk->signal->it_real_incr); + = ktime_to_timespec64(tsk->signal->it_real_incr); } /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { @@ -226,10 +230,10 @@ again: hrtimer_cancel_wait_running(timer); goto again; } - expires = timeval_to_ktime(value->it_value); + expires = timespec64_to_ktime(value->it_value); if (expires != 0) { tsk->signal->it_real_incr = - timeval_to_ktime(value->it_interval); + timespec64_to_ktime(value->it_interval); hrtimer_start(timer, expires, HRTIMER_MODE_REL); } else tsk->signal->it_real_incr = 0; @@ -252,7 +256,7 @@ again: #ifdef CONFIG_SECURITY_SELINUX void clear_itimer(void) { - struct itimerval v = {}; + struct itimerspec64 v = {}; int i; for (i = 0; i < 3; i++) @@ -276,15 +280,15 @@ void clear_itimer(void) */ static unsigned int alarm_setitimer(unsigned int seconds) { - struct itimerval it_new, it_old; + struct itimerspec64 it_new, it_old; #if BITS_PER_LONG < 64 if (seconds > INT_MAX) seconds = INT_MAX; #endif it_new.it_value.tv_sec = seconds; - it_new.it_value.tv_usec = 0; - it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + it_new.it_value.tv_nsec = 0; + it_new.it_interval.tv_sec = it_new.it_interval.tv_nsec = 0; do_setitimer(ITIMER_REAL, &it_new, &it_old); @@ -292,8 +296,8 @@ static unsigned int alarm_setitimer(unsigned int seconds) * We can't return 0 if we have an alarm pending ... And we'd * better return too much than too little anyway */ - if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) || - it_old.it_value.tv_usec >= 500000) + if ((!it_old.it_value.tv_sec && it_old.it_value.tv_nsec) || + it_old.it_value.tv_nsec >= 500000) it_old.it_value.tv_sec++; return it_old.it_value.tv_sec; @@ -310,15 +314,35 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) #endif +static int get_itimerval(struct itimerspec64 *o, const struct itimerval __user *i) +{ + struct itimerval v; + + if (copy_from_user(&v, i, sizeof(struct itimerval))) + return -EFAULT; + + /* Validate the timevals in value. */ + if (!timeval_valid(&v.it_value) || + !timeval_valid(&v.it_interval)) + return -EINVAL; + + o->it_interval.tv_sec = v.it_interval.tv_sec; + o->it_interval.tv_nsec = v.it_interval.tv_usec * NSEC_PER_USEC; + o->it_value.tv_sec = v.it_value.tv_sec; + o->it_value.tv_nsec = v.it_value.tv_usec * NSEC_PER_USEC; + return 0; +} + SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, struct itimerval __user *, ovalue) { - struct itimerval set_buffer, get_buffer; + struct itimerspec64 set_buffer, get_buffer; int error; if (value) { - if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) - return -EFAULT; + error = get_itimerval(&set_buffer, value); + if (error) + return error; } else { memset(&set_buffer, 0, sizeof(set_buffer)); printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." 
@@ -330,43 +354,53 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, if (error || !ovalue) return error; - if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) + if (put_itimerval(ovalue, &get_buffer)) return -EFAULT; return 0; } #if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) -static int get_old_itimerval32(struct itimerval *o, const struct old_itimerval32 __user *i) +static int get_old_itimerval32(struct itimerspec64 *o, const struct old_itimerval32 __user *i) { struct old_itimerval32 v32; if (copy_from_user(&v32, i, sizeof(struct old_itimerval32))) return -EFAULT; + + /* Validate the timevals in value. */ + if (!timeval_valid(&v32.it_value) || + !timeval_valid(&v32.it_interval)) + return -EINVAL; + o->it_interval.tv_sec = v32.it_interval.tv_sec; - o->it_interval.tv_usec = v32.it_interval.tv_usec; + o->it_interval.tv_nsec = v32.it_interval.tv_usec * NSEC_PER_USEC; o->it_value.tv_sec = v32.it_value.tv_sec; - o->it_value.tv_usec = v32.it_value.tv_usec; + o->it_value.tv_nsec = v32.it_value.tv_usec * NSEC_PER_USEC; return 0; } COMPAT_SYSCALL_DEFINE3(setitimer, int, which, - struct old_itimerval32 __user *, in, - struct old_itimerval32 __user *, out) + struct old_itimerval32 __user *, value, + struct old_itimerval32 __user *, ovalue) { - struct itimerval kin, kout; + struct itimerspec64 set_buffer, get_buffer; int error; - if (in) { - if (get_old_itimerval32(&kin, in)) - return -EFAULT; + if (value) { + error = get_old_itimerval32(&set_buffer, value); + if (error) + return error; } else { - memset(&kin, 0, sizeof(kin)); + memset(&set_buffer, 0, sizeof(set_buffer)); + printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." + " Misfeature support will be removed\n", + current->comm); } - error = do_setitimer(which, &kin, out ? &kout : NULL); - if (error || !out) + error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); + if (error || !ovalue) return error; - if (put_old_itimerval32(out, &kout)) + if (put_old_itimerval32(ovalue, &get_buffer)) return -EFAULT; return 0; } -- cgit v1.2.3 From 942437c97fd9ff23a17c13118f50bd0490f6868c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 15 Jul 2019 11:46:10 +0200 Subject: y2038: allow disabling time32 system calls At the moment, the compilation of the old time32 system calls depends purely on the architecture. As systems with new libc based on 64-bit time_t are getting deployed, even architectures that previously supported these (notably x86-32 and arm32 but also many others) no longer depend on them, and removing them from a kernel image results in a smaller kernel binary, the same way we can leave out many other optional system calls. More importantly, on an embedded system that needs to keep working beyond year 2038, any user space program calling these system calls is likely a bug, so removing them from the kernel image does provide an extra debugging help for finding broken applications. I've gone back and forth on hiding this option unless CONFIG_EXPERT is set. This version leaves it visible based on the logic that eventually it will be turned off indefinitely. 
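Each COND_SYSCALL() above provides a weak stub so that a kernel configured without the time32 calls still links, and callers get -ENOSYS; roughly (simplified from the generic macro in include/linux/syscalls.h, which some architectures override):

	/* simplified expansion of COND_SYSCALL(nanosleep_time32) */
	asmlinkage long __weak sys_nanosleep_time32(void)
	{
		return sys_ni_syscall();	/* -ENOSYS */
	}

User space probing for support can then fall back to the 64-bit time variants when it sees -ENOSYS.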
Acked-by: Christian Brauner Signed-off-by: Arnd Bergmann --- kernel/sys_ni.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 34b76895b81e..3b69a560a7ac 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -410,6 +410,29 @@ COND_SYSCALL(send); COND_SYSCALL(bdflush); COND_SYSCALL(uselib); +/* optional: time32 */ +COND_SYSCALL(time32); +COND_SYSCALL(stime32); +COND_SYSCALL(utime32); +COND_SYSCALL(adjtimex_time32); +COND_SYSCALL(sched_rr_get_interval_time32); +COND_SYSCALL(nanosleep_time32); +COND_SYSCALL(rt_sigtimedwait_time32); +COND_SYSCALL_COMPAT(rt_sigtimedwait_time32); +COND_SYSCALL(timer_settime32); +COND_SYSCALL(timer_gettime32); +COND_SYSCALL(clock_settime32); +COND_SYSCALL(clock_gettime32); +COND_SYSCALL(clock_getres_time32); +COND_SYSCALL(clock_nanosleep_time32); +COND_SYSCALL(utimes_time32); +COND_SYSCALL(futimesat_time32); +COND_SYSCALL(pselect6_time32); +COND_SYSCALL_COMPAT(pselect6_time32); +COND_SYSCALL(ppoll_time32); +COND_SYSCALL_COMPAT(ppoll_time32); +COND_SYSCALL(utimensat_time32); +COND_SYSCALL(clock_adjtime32); /* * The syscalls below are not found in include/uapi/asm-generic/unistd.h -- cgit v1.2.3 From 58a74a2925a5b4125dd4f4e728490b9642534c81 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 15 Nov 2019 11:17:30 +0200 Subject: tracing: Increase SYNTH_FIELDS_MAX for synthetic_events Increase the maximum allowed count of synthetic event fields from 16 to 32 in order to allow for larger-than-usual events. Link: http://lkml.kernel.org/r/20191115091730.9192-1-dedekind1@gmail.com Reviewed-by: Tom Zanussi Signed-off-by: Artem Bityutskiy Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7482a1466ebf..f49d1a36d3ae 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -23,7 +23,7 @@ #include "trace_dynevent.h" #define SYNTH_SYSTEM "synthetic" -#define SYNTH_FIELDS_MAX 16 +#define SYNTH_FIELDS_MAX 32 #define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ -- cgit v1.2.3 From 1c7f9b673dc0a15753274c4e7f5ebfd4468fc69f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:13:20 -0500 Subject: ftrace: Fix accounting bug with direct->count in register_ftrace_direct() The direct->count wasn't being updated properly: it was only updated when the first entry was added, but it should be updated every time.
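The effect is easiest to see with two call sites sharing one trampoline; ip1/ip2 and my_tramp are illustrative names, not from the patch:

	register_ftrace_direct(ip1, (unsigned long)my_tramp);	/* count = 1 */
	register_ftrace_direct(ip2, (unsigned long)my_tramp);	/* was stuck at 1,
								 * now counts to 2 */
	unregister_ftrace_direct(ip1, (unsigned long)my_tramp);
	/* with the old accounting this could drop the my_tramp
	 * bookkeeping to zero while ip2 still uses it */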
Fixes: 013bf0da04748 ("ftrace: Add ftrace_find_direct_func()") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 834f3556ea1e..32e4e5ffdd97 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5093,8 +5093,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) ftrace_direct_func_count--; } } else { - if (!direct->count) - direct->count++; + direct->count++; } out_unlock: mutex_unlock(&direct_mutex); -- cgit v1.2.3 From 406acdd32d3e7d5a6dcb7f67798e89068fbe0d77 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:19:04 -0500 Subject: ftrace: Add another check for match in register_ftrace_direct() As an instruction pointer passed into register_ftrace_direct() may land on the ftrace call site without being the start of the call site itself, register_ftrace_direct() still needs to test whether a direct call exists on the normalized site, as only one direct call is allowed at any one time. Fixes: 763e34e74bb7d ("ftrace: Add register_ftrace_direct()") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 32e4e5ffdd97..9fe33ebaf914 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5030,7 +5030,12 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) goto out_unlock; /* Make sure the ip points to the exact record */ - ip = rec->ip; + if (ip != rec->ip) { + ip = rec->ip; + /* Need to check this ip for a direct. */ + if (find_rec_direct(ip)) + goto out_unlock; + } ret = -ENOMEM; if (ftrace_hash_empty(direct_functions) || -- cgit v1.2.3 From 128161f47bc3797b0d068da13e311770685d6e4f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:14:45 -0500 Subject: ftrace: Add helper find_direct_entry() to consolidate code Both unregister_ftrace_direct() and modify_ftrace_direct() need to normalize the ip passed in to match the rec->ip, as it is acceptable for the ip to be on the ftrace call site but not at its start. There are also common validity checks on the record found by the ip; these should be done for both unregister_ftrace_direct() and modify_ftrace_direct().
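For reference, a minimal direct-call user pairs the two entry points roughly as in the kernel's ftrace-direct sample module (my_tramp is an assembly trampoline, assumed here):

	static int __init ftrace_direct_init(void)
	{
		return register_ftrace_direct((unsigned long)wake_up_process,
					      (unsigned long)my_tramp);
	}

	static void __exit ftrace_direct_exit(void)
	{
		unregister_ftrace_direct((unsigned long)wake_up_process,
					 (unsigned long)my_tramp);
	}

Either call may pass an ip anywhere on the call site; find_direct_entry() below normalizes it to rec->ip and performs the shared sanity checks.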
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 59 +++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9fe33ebaf914..ef79c8393f53 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5112,30 +5112,40 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } EXPORT_SYMBOL_GPL(register_ftrace_direct); -int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) { struct ftrace_func_entry *entry; - struct ftrace_direct_func *direct; struct dyn_ftrace *rec; - int ret = -ENODEV; - mutex_lock(&direct_mutex); + rec = lookup_rec(*ip, *ip); + if (!rec) + return NULL; - entry = __ftrace_lookup_ip(direct_functions, ip); + entry = __ftrace_lookup_ip(direct_functions, rec->ip); if (!entry) { - /* OK if it is off by a little */ - rec = lookup_rec(ip, ip); - if (!rec || rec->ip == ip) - goto out_unlock; + WARN_ON(rec->flags & FTRACE_FL_DIRECT); + return NULL; + } - entry = __ftrace_lookup_ip(direct_functions, rec->ip); - if (!entry) { - WARN_ON(rec->flags & FTRACE_FL_DIRECT); - goto out_unlock; - } + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - } + /* Passed in ip just needs to be on the call site */ + *ip = rec->ip; + + return entry; +} + +int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_direct_func *direct; + struct ftrace_func_entry *entry; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + + entry = find_direct_entry(&ip); + if (!entry) + goto out_unlock; if (direct_functions->count == 1) unregister_ftrace_function(&direct_ops); @@ -5187,24 +5197,13 @@ int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr) { struct ftrace_func_entry *entry; - struct dyn_ftrace *rec; int ret = -ENODEV; mutex_lock(&direct_mutex); - entry = __ftrace_lookup_ip(direct_functions, ip); - if (!entry) { - /* OK if it is off by a little */ - rec = lookup_rec(ip, ip); - if (!rec || rec->ip == ip) - goto out_unlock; - - entry = __ftrace_lookup_ip(direct_functions, rec->ip); - if (!entry) - goto out_unlock; - ip = rec->ip; - WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - } + entry = find_direct_entry(&ip); + if (!entry) + goto out_unlock; ret = -EINVAL; if (entry->direct != old_addr) -- cgit v1.2.3 From 4a169a95d885fe5c050bac1a21d43c86ba955bcf Mon Sep 17 00:00:00 2001 From: Maulik Shah Date: Fri, 15 Nov 2019 15:11:49 -0700 Subject: genirq: Introduce irq_chip_get/set_parent_state calls On certain QTI chipsets some GPIOs are direct-connect interrupts to the GIC to be used as regular interrupt lines. When the GPIOs are not used for interrupt generation the interrupt line is disabled. But disabling the interrupt at the GIC does not prevent it from being reported as pending at GIC_ISPEND. Later, when drivers call enable_irq() on the interrupt, an unwanted interrupt occurs. Introduce get and set methods for the irqchip's parent to clear its pending irq state. The GPIO interrupt controller can then invoke these on the parents in its hierarchy to clear the interrupt before enabling it.
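A hypothetical hierarchical GPIO driver would then clear the stale parent state in its enable path; the driver and function names below are illustrative, while the two helpers are the ones this patch adds:

	static void my_gpio_irq_enable(struct irq_data *d)
	{
		/* discard a pending bit latched at the parent (GIC)
		 * while the line was disabled */
		irq_chip_set_parent_state(d, IRQCHIP_STATE_PENDING, false);
		irq_chip_enable_parent(d);
	}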
Signed-off-by: Maulik Shah Signed-off-by: Lina Iyer Signed-off-by: Marc Zyngier Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/1573855915-9841-7-git-send-email-ilina@codeaurora.org [updated commit text and minor code fixes] --- kernel/irq/chip.c | 44 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b76703b2c0af..b3fa2d87d2f3 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1297,6 +1297,50 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq); #endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */ +/** + * irq_chip_set_parent_state - set the state of a parent interrupt. + * + * @data: Pointer to interrupt specific data + * @which: State to be restored (one of IRQCHIP_STATE_*) + * @val: Value corresponding to @which + * + * Conditional success, if the underlying irqchip does not implement it. + */ +int irq_chip_set_parent_state(struct irq_data *data, + enum irqchip_irq_state which, + bool val) +{ + data = data->parent_data; + + if (!data || !data->chip->irq_set_irqchip_state) + return 0; + + return data->chip->irq_set_irqchip_state(data, which, val); +} +EXPORT_SYMBOL_GPL(irq_chip_set_parent_state); + +/** + * irq_chip_get_parent_state - get the state of a parent interrupt. + * + * @data: Pointer to interrupt specific data + * @which: one of IRQCHIP_STATE_* the caller wants to know + * @state: a pointer to a boolean where the state is to be stored + * + * Conditional success, if the underlying irqchip does not implement it. + */ +int irq_chip_get_parent_state(struct irq_data *data, + enum irqchip_irq_state which, + bool *state) +{ + data = data->parent_data; + + if (!data || !data->chip->irq_get_irqchip_state) + return 0; + + return data->chip->irq_get_irqchip_state(data, which, state); +} +EXPORT_SYMBOL_GPL(irq_chip_get_parent_state); + /** * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if * NULL) -- cgit v1.2.3 From ea806eb3eab35528b578a061b2c4b28f0f92c465 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 17 Nov 2019 17:04:15 -0500 Subject: ftrace: Add a helper function to modify_ftrace_direct() to allow arch optimization If a direct ftrace callback is at a location that does not have any other ftrace helpers attached to it, it is possible to simply change the text to call the new caller (if the architecture supports it). But this requires special architecture code. Currently, modify_ftrace_direct() uses a trick: it adds a stub ftrace callback to the location, forcing it to call the ftrace iterator. Then it can change the direct helper to call the new function in C, and then remove the stub. Removing the stub makes the location call the new address that the direct helper is using. The new helper function implements this register-the-stub trick, but is a weak function, allowing an architecture to override it and do something a bit more direct.
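Per the kerneldoc in the hunk below, an architecture override could skip the stub dance entirely and patch the site; a sketch, where arch_patch_call_site() is a made-up primitive standing in for real arch text-poking code:

	int ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
					struct dyn_ftrace *rec,
					unsigned long old_addr,
					unsigned long new_addr)
	{
		int ret = arch_patch_call_site(rec->ip, old_addr, new_addr);

		if (!ret)
			entry->direct = new_addr;	/* required by the contract */
		return ret;
	}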
Link: https://lore.kernel.org/r/20191115215125.mbqv7taqnx376yed@ast-mbp.dhcp.thefacebook.com Suggested-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 120 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 88 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ef79c8393f53..caae523f4ef3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1020,12 +1020,6 @@ static bool update_all_ops; # error Dynamic ftrace depends on MCOUNT_RECORD #endif -struct ftrace_func_entry { - struct hlist_node hlist; - unsigned long ip; - unsigned long direct; /* for direct lookup only */ -}; - struct ftrace_func_probe { struct ftrace_probe_ops *probe_ops; struct ftrace_ops ops; @@ -5112,7 +5106,8 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } EXPORT_SYMBOL_GPL(register_ftrace_direct); -static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) +static struct ftrace_func_entry *find_direct_entry(unsigned long *ip, + struct dyn_ftrace **recp) { struct ftrace_func_entry *entry; struct dyn_ftrace *rec; @@ -5132,6 +5127,9 @@ static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) /* Passed in ip just needs to be on the call site */ *ip = rec->ip; + if (recp) + *recp = rec; + return entry; } @@ -5143,7 +5141,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) mutex_lock(&direct_mutex); - entry = find_direct_entry(&ip); + entry = find_direct_entry(&ip, NULL); if (!entry) goto out_unlock; @@ -5179,6 +5177,75 @@ static struct ftrace_ops stub_ops = { .func = ftrace_stub, }; +/** + * ftrace_modify_direct_caller - modify ftrace nop directly + * @entry: The ftrace hash entry of the direct helper for @rec + * @rec: The record representing the function site to patch + * @old_addr: The location that the site at @rec->ip currently calls + * @new_addr: The location that the site at @rec->ip should call + * + * An architecture may overwrite this function to optimize the + * changing of the direct callback on an ftrace nop location. + * This is called with the ftrace_lock mutex held, and no other + * ftrace callbacks are on the associated record (@rec). Thus, + * it is safe to modify the ftrace record, where it should be + * currently calling @old_addr directly, to call @new_addr. + * + * Safety checks should be made to make sure that the code at + * @rec->ip is currently calling @old_addr. And this must + * also update entry->direct to @new_addr. + */ +int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, + struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long new_addr) +{ + unsigned long ip = rec->ip; + int ret; + + /* + * The ftrace_lock was used to determine if the record + * had more than one registered user to it. If it did, + * we needed to prevent that from changing to do the quick + * switch. But if it did not (only a direct caller was attached) + * then this function is called. But this function can deal + * with attached callers to the rec that we care about, and + * since this function uses standard ftrace calls that take + * the ftrace_lock mutex, we need to release it. + */ + mutex_unlock(&ftrace_lock); + + /* + * By setting a stub function at the same address, we force + * the code to call the iterator and the direct_ops helper. + * This means that @ip does not call the direct call, and + * we can simply modify it. 
+ */ + ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); + if (ret) + goto out_lock; + + ret = register_ftrace_function(&stub_ops); + if (ret) { + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + goto out_lock; + } + + entry->direct = new_addr; + + /* + * By removing the stub, we put back the direct call, calling + * the @new_addr. + */ + unregister_ftrace_function(&stub_ops); + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + + out_lock: + mutex_lock(&ftrace_lock); + + return ret; +} + /** * modify_ftrace_direct - Modify an existing direct call to call something else * @ip: The instruction pointer to modify @@ -5197,11 +5264,13 @@ int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr) { struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; int ret = -ENODEV; mutex_lock(&direct_mutex); - entry = find_direct_entry(&ip); + mutex_lock(&ftrace_lock); + entry = find_direct_entry(&ip, &rec); if (!entry) goto out_unlock; @@ -5210,33 +5279,20 @@ int modify_ftrace_direct(unsigned long ip, goto out_unlock; /* - * By setting a stub function at the same address, we force - * the code to call the iterator and the direct_ops helper. - * This means that @ip does not call the direct call, and - * we can simply modify it. + * If there's no other ftrace callback on the rec->ip location, + * then it can be changed directly by the architecture. + * If there is another caller, then we just need to change the + * direct caller helper to point to @new_addr. */ - ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); - if (ret) - goto out_unlock; - - ret = register_ftrace_function(&stub_ops); - if (ret) { - ftrace_set_filter_ip(&stub_ops, ip, 1, 0); - goto out_unlock; + if (ftrace_rec_count(rec) == 1) { + ret = ftrace_modify_direct_caller(entry, rec, old_addr, new_addr); + } else { + entry->direct = new_addr; + ret = 0; } - entry->direct = new_addr; - - /* - * By removing the stub, we put back the direct call, calling - * the @new_addr. - */ - unregister_ftrace_function(&stub_ops); - ftrace_set_filter_ip(&stub_ops, ip, 1, 0); - - ret = 0; - out_unlock: + mutex_unlock(&ftrace_lock); mutex_unlock(&direct_mutex); return ret; } -- cgit v1.2.3 From 46f9469247c6f4697cbbf37e4b3961120bf07f29 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 18 Nov 2019 10:41:29 -0500 Subject: ftrace: Rename ftrace_graph_stub to ftrace_stub_graph The ftrace_graph_stub was created and points to ftrace_stub as a way to assign the function graph tracer function pointer to a stub function with a different prototype than what ftrace_stub has, without triggering the C type verifier. The ftrace_graph_stub was created via the linker script vmlinux.lds.h. Unfortunately, powerpc already uses the name ftrace_graph_stub for its internal implementation of the function graph tracer, and even though powerpc would still build, the change via the linker script broke the function graph tracer on powerpc. By using the name ftrace_stub_graph, which does not exist anywhere else in the kernel, this should not be a problem.
Link: https://lore.kernel.org/r/1573849732.5937.136.camel@lca.pw Fixes: b83b43ffc6e4 ("fgraph: Fix function type mismatches of ftrace_graph_return using ftrace_stub") Reported-by: Qian Cai Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index fa3ce10d0405..67e0c462b059 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -336,10 +336,10 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) * Simply points to ftrace_stub, but with the proper protocol. * Defined by the linker script in linux/vmlinux.lds.h */ -extern void ftrace_graph_stub(struct ftrace_graph_ret *); +extern void ftrace_stub_graph(struct ftrace_graph_ret *); /* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_stub; +trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; @@ -619,7 +619,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) goto out; ftrace_graph_active--; - ftrace_graph_return = ftrace_graph_stub; + ftrace_graph_return = ftrace_stub_graph; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); -- cgit v1.2.3 From c55b51a06b01d67a99457bb82a8c31081c7faa23 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 16 Nov 2019 14:16:12 +0100 Subject: cpuidle: Allow idle injection to apply exit latency limit In some cases it may be useful to specify an exit latency limit for the idle state to be used during CPU idle time injection. Instead of duplicating the information in struct cpuidle_device or propagating the latency limit in the call stack, replace the use_deepest_state field with forced_idle_latency_limit_ns to represent that limit, so that the deepest idle state with exit latency within that limit is forced (i.e. no governors) when it is set. A zero exit latency limit for forced idle means to use governors in the usual way (analogous to use_deepest_state equal to "false" before this change). Additionally, add play_idle_precise() taking two arguments, the duration of forced idle and the idle state exit latency limit, both in nanoseconds, and redefine play_idle() as a wrapper around that new function. This change is preparatory; no functional impact is expected. Suggested-by: Rafael J. Wysocki Signed-off-by: Daniel Lezcano [ rjw: Subject, changelog, cpuidle_use_deepest_state() kerneldoc, whitespace ] Signed-off-by: Rafael J. Wysocki --- kernel/sched/idle.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1aa260702b38..cd05ffa0abfe 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -165,7 +165,7 @@ static void cpuidle_idle_call(void) * until a proper wakeup interrupt happens.
*/ - if (idle_should_enter_s2idle() || dev->use_deepest_state) { + if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) { if (idle_should_enter_s2idle()) { rcu_idle_enter(); @@ -311,7 +311,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -void play_idle(unsigned long duration_us) +void play_idle_precise(u64 duration_ns, u64 latency_ns) { struct idle_timer it; @@ -323,29 +323,29 @@ void play_idle(unsigned long duration_us) WARN_ON_ONCE(current->nr_cpus_allowed != 1); WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); - WARN_ON_ONCE(!duration_us); + WARN_ON_ONCE(!duration_ns); rcu_sleep_check(); preempt_disable(); current->flags |= PF_IDLE; - cpuidle_use_deepest_state(true); + cpuidle_use_deepest_state(latency_ns); it.done = 0; hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); it.timer.function = idle_inject_timer_fn; - hrtimer_start(&it.timer, ns_to_ktime(duration_us * NSEC_PER_USEC), + hrtimer_start(&it.timer, ns_to_ktime(duration_ns), HRTIMER_MODE_REL_PINNED); while (!READ_ONCE(it.done)) do_idle(); - cpuidle_use_deepest_state(false); + cpuidle_use_deepest_state(0); current->flags &= ~PF_IDLE; preempt_fold_need_resched(); preempt_enable(); } -EXPORT_SYMBOL_GPL(play_idle); +EXPORT_SYMBOL_GPL(play_idle_precise); void cpu_startup_entry(enum cpuhp_state state) { -- cgit v1.2.3 From 5aa9ba6312e36c18626e73506b92d1513d815435 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 16 Nov 2019 14:16:13 +0100 Subject: cpuidle: Pass exit latency limit to cpuidle_use_deepest_state() Modify cpuidle_use_deepest_state() to take an additional exit latency limit argument to be passed to find_deepest_idle_state() and make cpuidle_idle_call() pass dev->forced_idle_latency_limit_ns to it for forced idle. Suggested-by: Rafael J. Wysocki Signed-off-by: Daniel Lezcano [ rjw: Rebase and rearrange code, subject & changelog ] Signed-off-by: Rafael J. Wysocki --- kernel/sched/idle.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cd05ffa0abfe..fc9604ddd802 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -166,6 +166,8 @@ static void cpuidle_idle_call(void) */ if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) { + u64 max_latency_ns; + if (idle_should_enter_s2idle()) { rcu_idle_enter(); @@ -176,12 +178,16 @@ static void cpuidle_idle_call(void) } rcu_idle_exit(); + + max_latency_ns = U64_MAX; + } else { + max_latency_ns = dev->forced_idle_latency_limit_ns; } tick_nohz_idle_stop_tick(); rcu_idle_enter(); - next_state = cpuidle_find_deepest_state(drv, dev); + next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns); call_cpuidle(drv, dev, next_state); } else { bool stop_tick = true; -- cgit v1.2.3 From 56e35f9c5b87ec1ae93e483284e189c84388de16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Nov 2019 18:03:11 +0100 Subject: dma-mapping: drop the dev argument to arch_sync_dma_for_* These are pure cache maintenance routines, so drop the unused struct device argument.
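An architecture now implements the hooks without the device argument; a sketch for a write-back cache machine, with cache_clean_range()/cache_inv_range() standing in for real arch primitives:

	void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
				      enum dma_data_direction dir)
	{
		if (dir == DMA_FROM_DEVICE)
			cache_inv_range(paddr, paddr + size);	/* drop stale lines */
		else
			cache_clean_range(paddr, paddr + size);	/* publish dirty data */
	}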
Signed-off-by: Christoph Hellwig Suggested-by: Daniel Vetter --- kernel/dma/direct.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 22a2e0833862..077876ae5c74 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -268,7 +268,7 @@ void dma_direct_sync_single_for_device(struct device *dev, swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(dev, paddr, size, dir); + arch_sync_dma_for_device(paddr, size, dir); } EXPORT_SYMBOL(dma_direct_sync_single_for_device); @@ -286,7 +286,7 @@ void dma_direct_sync_sg_for_device(struct device *dev, dir, SYNC_FOR_DEVICE); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(dev, paddr, sg->length, + arch_sync_dma_for_device(paddr, sg->length, dir); } } @@ -302,8 +302,8 @@ void dma_direct_sync_single_for_cpu(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, addr); if (!dev_is_dma_coherent(dev)) { - arch_sync_dma_for_cpu(dev, paddr, size, dir); - arch_sync_dma_for_cpu_all(dev); + arch_sync_dma_for_cpu(paddr, size, dir); + arch_sync_dma_for_cpu_all(); } if (unlikely(is_swiotlb_buffer(paddr))) @@ -321,7 +321,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_cpu(dev, paddr, sg->length, dir); + arch_sync_dma_for_cpu(paddr, sg->length, dir); if (unlikely(is_swiotlb_buffer(paddr))) swiotlb_tbl_sync_single(dev, paddr, sg->length, dir, @@ -329,7 +329,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, } if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_cpu_all(dev); + arch_sync_dma_for_cpu_all(); } EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu); @@ -380,7 +380,7 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, } if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, phys, size, dir); + arch_sync_dma_for_device(phys, size, dir); return dma_addr; } EXPORT_SYMBOL(dma_direct_map_page); -- cgit v1.2.3 From 50f579a2399dee0ad1c86ea8159ab8657b74f95b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Nov 2019 09:18:19 +0300 Subject: dma-debug: clean up put_hash_bucket() The put_hash_bucket() is a bit cleaner if it takes an unsigned long directly instead of a pointer to unsigned long. 
Signed-off-by: Dan Carpenter Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 004496654aaa..64972aa9bfc3 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -255,12 +255,10 @@ static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, * Give up exclusive access to the hash bucket */ static void put_hash_bucket(struct hash_bucket *bucket, - unsigned long *flags) + unsigned long flags) __releases(&bucket->lock) { - unsigned long __flags = *flags; - - spin_unlock_irqrestore(&bucket->lock, __flags); + spin_unlock_irqrestore(&bucket->lock, flags); } static bool exact_match(struct dma_debug_entry *a, struct dma_debug_entry *b) @@ -359,7 +357,7 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, /* * Nothing found, go back a hash bucket */ - put_hash_bucket(*bucket, flags); + put_hash_bucket(*bucket, *flags); range += (1 << HASH_FN_SHIFT); index.dev_addr -= (1 << HASH_FN_SHIFT); *bucket = get_hash_bucket(&index, flags); @@ -609,7 +607,7 @@ static void add_dma_entry(struct dma_debug_entry *entry) bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); rc = active_cacheline_insert(entry); if (rc == -ENOMEM) { @@ -1002,7 +1000,7 @@ static void check_unmap(struct dma_debug_entry *ref) if (!entry) { /* must drop lock before calling dma_mapping_error */ - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); if (dma_mapping_error(ref->dev, ref->dev_addr)) { err_printk(ref->dev, NULL, @@ -1084,7 +1082,7 @@ static void check_unmap(struct dma_debug_entry *ref) hash_bucket_del(entry); dma_entry_free(entry); - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); } static void check_for_stack(struct device *dev, @@ -1204,7 +1202,7 @@ static void check_sync(struct device *dev, } out: - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); } static void check_sg_segment(struct device *dev, struct scatterlist *sg) @@ -1319,7 +1317,7 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) } } - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); } EXPORT_SYMBOL(debug_dma_mapping_error); @@ -1392,7 +1390,7 @@ static int get_nr_mapped_entries(struct device *dev, if (entry) mapped_ents = entry->sg_mapped_ents; - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); return mapped_ents; } -- cgit v1.2.3 From 4268ac6ae5870af10a7417b22990d615f72f77e2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2019 17:35:36 +0100 Subject: dma-direct: don't check swiotlb=force in dma_direct_map_resource When mapping resources we can't just use swiotlb ram for bounce buffering. Switch to a direct dma_capable check instead. 
Fixes: cfced786969c ("dma-mapping: remove the default map_resource implementation") Reported-by: Robin Murphy Signed-off-by: Christoph Hellwig Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski --- kernel/dma/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 077876ae5c74..a479bd2d1e8b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -412,7 +412,7 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, { dma_addr_t dma_addr = paddr; - if (unlikely(!dma_direct_possible(dev, dma_addr, size))) { + if (unlikely(!dma_capable(dev, dma_addr, size))) { report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } -- cgit v1.2.3 From 68a33b1794665ba8a1d1ef1d3bfcc7c587d380a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2019 17:38:58 +0100 Subject: dma-direct: exclude dma_direct_map_resource from the min_low_pfn check The valid memory address check in dma_capable only makes sense when mapping normal memory, not when using dma_map_resource to map a device resource. Add a new boolean argument to dma_capable to exclude that check for the dma_map_resource case. Fixes: b12d66278dd6 ("dma-direct: check for overflows on 32 bit DMA addresses") Reported-by: Marek Szyprowski Signed-off-by: Christoph Hellwig Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski --- kernel/dma/direct.c | 4 ++-- kernel/dma/swiotlb.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a479bd2d1e8b..40f1f0aac4b1 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -363,7 +363,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, size_t size) { return swiotlb_force != SWIOTLB_FORCE && - dma_capable(dev, dma_addr, size); + dma_capable(dev, dma_addr, size, true); } dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, @@ -412,7 +412,7 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, { dma_addr_t dma_addr = paddr; - if (unlikely(!dma_capable(dev, dma_addr, size))) { + if (unlikely(!dma_capable(dev, dma_addr, size, false))) { report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 673a2cdb2656..9280d6f8271e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -678,7 +678,7 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, /* Ensure that the address returned is DMA'ble */ *dma_addr = __phys_to_dma(dev, *phys); - if (unlikely(!dma_capable(dev, *dma_addr, size))) { + if (unlikely(!dma_capable(dev, *dma_addr, size, true))) { swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return false; -- cgit v1.2.3 From 7b8474466ed97be458c825f34a85f2c2b84c3f95 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 21 Nov 2019 00:03:03 +0000 Subject: time: Zero the upper 32-bits in __kernel_timespec on 32-bit On compat interfaces, the high order bits of nanoseconds should be zeroed out. This is because the application code or the libc do not guarantee zeroing of these. If used without zeroing, kernel might be at risk of using timespec values incorrectly. Originally it was handled correctly, but lost during is_compat_syscall() cleanup. Revert the condition back to check CONFIG_64BIT. 
Fixes: 98f76206b335 ("compat: Cleanup in_compat_syscall() callers") Reported-by: Ben Hutchings Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20191121000303.126523-1-dima@arista.com --- kernel/time/time.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 5c54ca632d08..83f403e7a15c 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -881,7 +881,8 @@ int get_timespec64(struct timespec64 *ts, ts->tv_sec = kts.tv_sec; /* Zero out the padding for 32 bit systems or in compat mode */ - if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall()) + if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || + in_compat_syscall())) kts.tv_nsec &= 0xFFFFFFFFUL; ts->tv_nsec = kts.tv_nsec; -- cgit v1.2.3 From a7ba70f1787f977f970cd116076c6fce4b9e01cc Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Thu, 21 Nov 2019 10:26:44 +0100 Subject: dma-mapping: treat dev->bus_dma_mask as a DMA limit Using a mask to represent bus DMA constraints has a set of limitations. The biggest one being it can only hold a power of two (minus one). The DMA mapping code is already aware of this and treats dev->bus_dma_mask as a limit. This quirk is already used by some architectures although still rare. With the introduction of the Raspberry Pi 4 we've found a new contender for the use of bus DMA limits, as its PCIe bus can only address the lower 3GB of memory (of a total of 4GB). This is impossible to represent with a mask. To make things worse the device-tree code rounds non power of two bus DMA limits to the next power of two, which is unacceptable in this case. In the light of this, rename dev->bus_dma_mask to dev->bus_dma_limit all over the tree and treat it as such. Note that dev->bus_dma_limit should contain the higher accessible DMA address. 
Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 267b23a13b69..6af7ae83c4ad 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -27,10 +27,10 @@ static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) { if (!dev->dma_mask) { dev_err_once(dev, "DMA map on device without dma_mask\n"); - } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { + } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) { dev_err_once(dev, - "overflow %pad+%zu of DMA mask %llx bus mask %llx\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_mask); + "overflow %pad+%zu of DMA mask %llx bus limit %llx\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); } WARN_ON_ONCE(1); } @@ -57,15 +57,14 @@ u64 dma_direct_get_required_mask(struct device *dev) } static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, - u64 *phys_mask) + u64 *phys_limit) { - if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask) - dma_mask = dev->bus_dma_mask; + u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); if (force_dma_unencrypted(dev)) - *phys_mask = __dma_to_phys(dev, dma_mask); + *phys_limit = __dma_to_phys(dev, dma_limit); else - *phys_mask = dma_to_phys(dev, dma_mask); + *phys_limit = dma_to_phys(dev, dma_limit); /* * Optimistically try the zone that the physical address mask falls @@ -75,9 +74,9 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding * zones. */ - if (*phys_mask <= DMA_BIT_MASK(zone_dma_bits)) + if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits)) return GFP_DMA; - if (*phys_mask <= DMA_BIT_MASK(32)) + if (*phys_limit <= DMA_BIT_MASK(32)) return GFP_DMA32; return 0; } @@ -85,7 +84,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= - min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask); + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, @@ -94,7 +93,7 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, size_t alloc_size = PAGE_ALIGN(size); int node = dev_to_node(dev); struct page *page = NULL; - u64 phys_mask; + u64 phys_limit; if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; @@ -102,7 +101,7 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_mask); + &phys_limit); page = dma_alloc_contiguous(dev, alloc_size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, alloc_size); @@ -116,7 +115,7 @@ again: page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && - phys_mask < DMA_BIT_MASK(64) && + phys_limit < DMA_BIT_MASK(64) && !(gfp & (GFP_DMA32 | GFP_DMA))) { gfp |= GFP_DMA32; goto again; -- cgit v1.2.3 From 0e4a459f56c32d3e52ae69a4b447db2f48a65f44 Mon Sep 17 00:00:00 2001 From: Kusanagi Kouichi Date: Wed, 20 Nov 2019 19:43:50 +0900 Subject: tracing: Remove unnecessary DEBUG_FS dependency Tracing replaced debugfs with 
tracefs. Signed-off-by: Kusanagi Kouichi Reviewed-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20191120104350753.EWCT.12796.ppp.dion.ne.jp@dmta0009.auone-net.jp Signed-off-by: Greg Kroah-Hartman --- kernel/trace/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e08527f50d2a..382628b9b759 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -106,7 +106,6 @@ config PREEMPTIRQ_TRACEPOINTS config TRACING bool - select DEBUG_FS select RING_BUFFER select STACKTRACE if STACKTRACE_SUPPORT select TRACEPOINTS -- cgit v1.2.3 From a82a4804b4ee3636c8988fea14d44f70f4de45f1 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Sat, 16 Nov 2019 10:05:55 -0500 Subject: ring-buffer: Fix typos in function ring_buffer_producer Fix spelling and other typos Link: http://lkml.kernel.org/r/1573916755-32478-1-git-send-email-xianting_tian@126.com Signed-off-by: Xianting Tian Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer_benchmark.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 09b0b49f346e..32149e46551c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -269,10 +269,10 @@ static void ring_buffer_producer(void) #ifndef CONFIG_PREEMPTION /* - * If we are a non preempt kernel, the 10 second run will + * If we are a non preempt kernel, the 10 seconds run will * stop everything while it runs. Instead, we will call * cond_resched and also add any time that was lost by a - * rescedule. + * reschedule. * * Do a cond resched at the same frequency we would wake up * the reader. -- cgit v1.2.3 From fc809bc5ceaa665497384064ab2d76713c774bad Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 20 Nov 2019 21:38:07 +0800 Subject: tracing: Fix Kconfig indentation Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Link: http://lkml.kernel.org/r/20191120133807.12741-1-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b872716bb2a0..f67620499faa 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -79,7 +79,7 @@ config FTRACE_NMI_ENTER config EVENT_TRACING select CONTEXT_SWITCH_TRACER - select GLOB + select GLOB bool config CONTEXT_SWITCH_TRACER @@ -311,7 +311,7 @@ config TRACER_SNAPSHOT cat snapshot config TRACER_SNAPSHOT_PER_CPU_SWAP - bool "Allow snapshot to swap per CPU" + bool "Allow snapshot to swap per CPU" depends on TRACER_SNAPSHOT select RING_BUFFER_ALLOW_SWAP help @@ -683,7 +683,7 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. config TRACEPOINT_BENCHMARK - bool "Add tracepoint that benchmarks tracepoints" + bool "Add tracepoint that benchmarks tracepoints" help This option creates the tracepoint "benchmark:benchmark_event". When the tracepoint is enabled, it kicks off a kernel thread that @@ -732,7 +732,7 @@ config RING_BUFFER_STARTUP_TEST bool "Ring buffer startup self test" depends on RING_BUFFER help - Run a simple self test on the ring buffer on boot up. Late in the + Run a simple self test on the ring buffer on boot up. 
Late in the kernel boot sequence, the test will start that kicks off a thread per cpu. Each thread will write various size events into the ring buffer. Another thread is created to send IPIs -- cgit v1.2.3 From 28879787147358e8ffcae397f11748de3dd26577 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 20 Nov 2019 11:08:38 -0800 Subject: tracing: Adding new functions for kernel access to Ftrace instances Adding 2 new functions - 1) struct trace_array *trace_array_get_by_name(const char *name); Return pointer to a trace array with given name. If it does not exist, create and return pointer to the new trace array. 2) int trace_array_set_clr_event(struct trace_array *tr, const char *system ,const char *event, bool enable); Enable/Disable events to this trace array. Additionally, - To handle reference counters, export trace_array_put() - Due to introduction of the above 2 new functions, we no longer need to export - ftrace_set_clr_event & trace_array_create APIs. Link: http://lkml.kernel.org/r/1574276919-11119-2-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Reviewed-by: Aruna Ramakrishna Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 96 +++++++++++++++++++++++++++++++++++---------- kernel/trace/trace.h | 1 - kernel/trace/trace_events.c | 27 ++++++++++++- 3 files changed, 102 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42659ce6ac0c..02a23a6e5e00 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -301,12 +301,24 @@ static void __trace_array_put(struct trace_array *this_tr) this_tr->ref--; } +/** + * trace_array_put - Decrement the reference counter for this trace array. + * + * NOTE: Use this when we no longer need the trace array returned by + * trace_array_get_by_name(). This ensures the trace array can be later + * destroyed. 
+ * + */ void trace_array_put(struct trace_array *this_tr) { + if (!this_tr) + return; + mutex_lock(&trace_types_lock); __trace_array_put(this_tr); mutex_unlock(&trace_types_lock); } +EXPORT_SYMBOL_GPL(trace_array_put); int tracing_check_open_get_tr(struct trace_array *tr) { @@ -8437,24 +8449,15 @@ static void update_tracer_options(struct trace_array *tr) mutex_unlock(&trace_types_lock); } -struct trace_array *trace_array_create(const char *name) +static struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; int ret; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); - - ret = -EEXIST; - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) - goto out_unlock; - } - ret = -ENOMEM; tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) - goto out_unlock; + return ERR_PTR(ret); tr->name = kstrdup(name, GFP_KERNEL); if (!tr->name) @@ -8499,8 +8502,8 @@ struct trace_array *trace_array_create(const char *name) list_add(&tr->list, &ftrace_trace_arrays); - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); + tr->ref++; + return tr; @@ -8510,24 +8513,77 @@ struct trace_array *trace_array_create(const char *name) kfree(tr->name); kfree(tr); - out_unlock: - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); - return ERR_PTR(ret); } -EXPORT_SYMBOL_GPL(trace_array_create); static int instance_mkdir(const char *name) { - return PTR_ERR_OR_ZERO(trace_array_create(name)); + struct trace_array *tr; + int ret; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = -EEXIST; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + tr = trace_array_create(name); + + ret = PTR_ERR_OR_ZERO(tr); + +out_unlock: + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + return ret; +} + +/** + * trace_array_get_by_name - Create/Lookup a trace array, given its name. + * @name: The name of the trace array to be looked up/created. + * + * Returns pointer to trace array with given name. + * NULL, if it cannot be created. + * + * NOTE: This function increments the reference counter associated with the + * trace array returned. This makes sure it cannot be freed while in use. + * Use trace_array_put() once the trace array is no longer needed. + * + */ +struct trace_array *trace_array_get_by_name(const char *name) +{ + struct trace_array *tr; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + tr = trace_array_create(name); + + if (IS_ERR(tr)) + tr = NULL; +out_unlock: + if (tr) + tr->ref++; + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + return tr; } +EXPORT_SYMBOL_GPL(trace_array_get_by_name); static int __remove_instance(struct trace_array *tr) { int i; - if (tr->ref || (tr->current_trace && tr->current_trace->ref)) + /* Reference counter for a newly created trace array = 1. 
*/ + if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref)) return -EBUSY; list_del(&tr->list); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2df8aed6a8f0..ca7fccafbcbb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -345,7 +345,6 @@ extern struct list_head ftrace_trace_arrays; extern struct mutex trace_types_lock; extern int trace_array_get(struct trace_array *tr); -extern void trace_array_put(struct trace_array *tr); extern int tracing_check_open_get_tr(struct trace_array *tr); extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2a3ac2365445..6b3a69e9aa6a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -827,7 +827,6 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) return ret; } -EXPORT_SYMBOL_GPL(ftrace_set_clr_event); /** * trace_set_clr_event - enable or disable an event @@ -852,6 +851,32 @@ int trace_set_clr_event(const char *system, const char *event, int set) } EXPORT_SYMBOL_GPL(trace_set_clr_event); +/** + * trace_array_set_clr_event - enable or disable an event for a trace array. + * @tr: concerned trace array. + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @enable: true to enable, false to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. + */ +int trace_array_set_clr_event(struct trace_array *tr, const char *system, + const char *event, bool enable) +{ + int set; + + if (!tr) + return -ENOENT; + + set = (enable == true) ? 1 : 0; + return __ftrace_set_clr_event(tr, NULL, system, event, set); +} +EXPORT_SYMBOL_GPL(trace_array_set_clr_event); + /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 -- cgit v1.2.3 From 0e24220821b0e0e330a18bfef29ac6396545d62e Mon Sep 17 00:00:00 2001 From: Hassan Naveed Date: Fri, 15 Nov 2019 23:44:42 +0000 Subject: tracing: Use xarray for syscall trace events Currently, a lot of memory is wasted for architectures like MIPS when init_ftrace_syscalls() allocates the array for syscalls using kcalloc. This is because syscalls numbers start from 4000, 5000 or 6000 and array elements up to that point are unused. Fix this by using a data structure more suited to storing sparsely populated arrays. The XARRAY data structure, implemented using radix trees, is much more memory efficient for storing the syscalls in question. 
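A minimal standalone sketch of the xarray pattern the patch adopts (the index 4123 is a made-up MIPS-style syscall number, not taken from the patch):

#include <linux/xarray.h>

struct syscall_metadata;

static DEFINE_XARRAY(metadata_sparse);

static void example(struct syscall_metadata *meta)
{
	/* Store at a sparse index; only populated slots consume memory.
	 * xa_store() returns the previous entry, or an xa_err()-encoded
	 * pointer if its internal allocation failed. */
	void *old = xa_store(&metadata_sparse, 4123, meta, GFP_KERNEL);

	WARN(xa_is_err(old), "xarray store failed\n");

	/* Lookup is a plain load; an empty slot returns NULL. */
	meta = xa_load(&metadata_sparse, 4123);
}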
Link: http://lkml.kernel.org/r/20191115234314.21599-1-hnaveed@wavecomp.com Signed-off-by: Hassan Naveed Reviewed-by: Paul Burton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_syscalls.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index fa8fbff736d6..16fa218556fa 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -7,6 +7,7 @@ #include /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ #include #include +#include <linux/xarray.h> #include #include "trace_output.h" @@ -30,6 +31,7 @@ syscall_get_enter_fields(struct trace_event_call *call) extern struct syscall_metadata *__start_syscalls_metadata[]; extern struct syscall_metadata *__stop_syscalls_metadata[]; +static DEFINE_XARRAY(syscalls_metadata_sparse); static struct syscall_metadata **syscalls_metadata; #ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME @@ -101,6 +103,9 @@ find_syscall_meta(unsigned long syscall) static struct syscall_metadata *syscall_nr_to_meta(int nr) { + if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) + return xa_load(&syscalls_metadata_sparse, (unsigned long)nr); + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) return NULL; @@ -536,12 +541,16 @@ void __init init_ftrace_syscalls(void) struct syscall_metadata *meta; unsigned long addr; int i; - - syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), - GFP_KERNEL); - if (!syscalls_metadata) { - WARN_ON(1); - return; + void *ret; + + if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { + syscalls_metadata = kcalloc(NR_syscalls, + sizeof(*syscalls_metadata), + GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return; + } } for (i = 0; i < NR_syscalls; i++) { @@ -551,7 +560,16 @@ void __init init_ftrace_syscalls(void) continue; meta->syscall_nr = i; - syscalls_metadata[i] = meta; + + if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { + syscalls_metadata[i] = meta; + } else { + ret = xa_store(&syscalls_metadata_sparse, i, meta, + GFP_KERNEL); + WARN(xa_is_err(ret), + "Syscall memory allocation failed\n"); + } + } } -- cgit v1.2.3 From 107e899874e95dcddc779142942bf285eba38bc5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Nov 2019 16:22:21 -0400 Subject: mm/hmm: define the pre-processor related parts of hmm.h even if disabled Only the function calls are stubbed out with static inlines that always fail. This is the standard way to write a header for an optional component and makes it easier for drivers that only optionally need HMM_MIRROR. Link: https://lore.kernel.org/r/20191112202231.3856-5-jgg@ziepe.ca Reviewed-by: Jérôme Glisse Tested-by: Ralph Campbell Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..ca39cfc404e3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,7 +40,6 @@ #include #include #include -#include <linux/hmm.h> #include #include #include -- cgit v1.2.3 From b111df8447acdeb4b9220f99d5d4b28f83eb56ad Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Nov 2019 21:25:46 +0100 Subject: y2038: alarm: fix half-second cut-off Changing alarm_itimer accidentally broke the logic for arithmetic rounding of half seconds in the return code. Change it to a constant based on NSEC_PER_SEC, as suggested by Ben Hutchings.
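The arithmetic is easy to check by hand: tv_nsec now holds nanoseconds, so the old constant 500000 (correct back when the field held microseconds) compared against half a millisecond, while NSEC_PER_SEC / 2 = 500000000 restores the intended half-second rounding. A small illustrative sketch:

	/* it_value = 3.6 s: at least half a second remains, so round up. */
	time64_t secs = 3;
	long nsec = 600000000;	/* 0.6 s */

	if ((!secs && nsec) || nsec >= (NSEC_PER_SEC / 2))
		secs++;		/* secs == 4 */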
Fixes: bd40a175769d ("y2038: itimer: change implementation to timespec64") Reported-by: Ben Hutchings Signed-off-by: Arnd Bergmann --- kernel/time/itimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 5872db9bd5f7..9e59c9ea92aa 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -297,7 +297,7 @@ static unsigned int alarm_setitimer(unsigned int seconds) * better return too much than too little anyway */ if ((!it_old.it_value.tv_sec && it_old.it_value.tv_nsec) || - it_old.it_value.tv_nsec >= 500000) + it_old.it_value.tv_nsec >= (NSEC_PER_SEC / 2)) it_old.it_value.tv_sec++; return it_old.it_value.tv_sec; -- cgit v1.2.3 From 61a47c1ad3a4dc6882f01ebdc88138ac62d0df03 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 1 Oct 2019 13:01:19 -0500 Subject: sysctl: Remove the sysctl system call This system call has been deprecated almost since it was introduced, and in a survey of the Linux distributions I can no longer find any of them that enable CONFIG_SYSCTL_SYSCALL. The only indication that I can find that anyone might care is that a few of the defconfigs in the kernel enable CONFIG_SYSCTL_SYSCALL. However, this appears in only 31 of 414 defconfigs in the kernel, so I suspect this symbol's presence is simply because it is harmless to include rather than because it is necessary. As there appear to be no users of the sysctl system call, remove the code. As this removes one of the few uses of the internal kernel mount of proc, I hope this allows for even more simplifications of the proc filesystem. Cc: Alex Smith Cc: Anders Berg Cc: Apelete Seketeli Cc: Arnd Bergmann Cc: Chee Nouk Phoon Cc: Chris Zankel Cc: Christian Ruppert Cc: Greg Ungerer Cc: Harvey Hunt Cc: Helge Deller Cc: Hongliang Tao Cc: Hua Yan Cc: Huacai Chen Cc: John Crispin Cc: Jonas Jensen Cc: Josh Boyer Cc: Jun Nie Cc: Kevin Hilman Cc: Kevin Wells Cc: Kumar Gala Cc: Lars-Peter Clausen Cc: Ley Foon Tan Cc: Linus Walleij Cc: Markos Chandras Cc: Max Filippov Cc: Noam Camus Cc: Olof Johansson Cc: Paul Burton Cc: Paul Mundt Cc: Phil Edworthy Cc: Pierrick Hascoet Cc: Ralf Baechle Cc: Roland Stigge Cc: Santosh Shilimkar Cc: Scott Telford Cc: Stephen Boyd Cc: Steven J. Hill Cc: Tanmay Inamdar Cc: Vineet Gupta Cc: Wolfram Sang Acked-by: Andi Kleen Reviewed-by: Kees Cook Signed-off-by: "Eric W. 
Biederman" --- kernel/sysctl_binary.c | 1305 ------------------------------------------------ 1 file changed, 1305 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 73c132095a7b..7d550cc76a3b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -18,1317 +18,12 @@ #include #include -#ifdef CONFIG_SYSCTL_SYSCALL - -struct bin_table; -typedef ssize_t bin_convert_t(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen); - -static bin_convert_t bin_dir; -static bin_convert_t bin_string; -static bin_convert_t bin_intvec; -static bin_convert_t bin_ulongvec; -static bin_convert_t bin_uuid; -static bin_convert_t bin_dn_node_address; - -#define CTL_DIR bin_dir -#define CTL_STR bin_string -#define CTL_INT bin_intvec -#define CTL_ULONG bin_ulongvec -#define CTL_UUID bin_uuid -#define CTL_DNADR bin_dn_node_address - -#define BUFSZ 256 - -struct bin_table { - bin_convert_t *convert; - int ctl_name; - const char *procname; - const struct bin_table *child; -}; - -static const struct bin_table bin_random_table[] = { - { CTL_INT, RANDOM_POOLSIZE, "poolsize" }, - { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" }, - { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" }, - { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" }, - { CTL_UUID, RANDOM_BOOT_ID, "boot_id" }, - { CTL_UUID, RANDOM_UUID, "uuid" }, - {} -}; - -static const struct bin_table bin_pty_table[] = { - { CTL_INT, PTY_MAX, "max" }, - { CTL_INT, PTY_NR, "nr" }, - {} -}; - -static const struct bin_table bin_kern_table[] = { - { CTL_STR, KERN_OSTYPE, "ostype" }, - { CTL_STR, KERN_OSRELEASE, "osrelease" }, - /* KERN_OSREV not used */ - { CTL_STR, KERN_VERSION, "version" }, - /* KERN_SECUREMASK not used */ - /* KERN_PROF not used */ - { CTL_STR, KERN_NODENAME, "hostname" }, - { CTL_STR, KERN_DOMAINNAME, "domainname" }, - - { CTL_INT, KERN_PANIC, "panic" }, - { CTL_INT, KERN_REALROOTDEV, "real-root-dev" }, - - { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" }, - { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" }, - { CTL_INT, KERN_PRINTK, "printk" }, - - /* KERN_NAMETRANS not used */ - /* KERN_PPC_HTABRECLAIM not used */ - /* KERN_PPC_ZEROPAGED not used */ - { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" }, - - { CTL_STR, KERN_MODPROBE, "modprobe" }, - { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" }, - { CTL_INT, KERN_ACCT, "acct" }, - /* KERN_PPC_L2CR "l2cr" no longer used */ - - /* KERN_RTSIGNR not used */ - /* KERN_RTSIGMAX not used */ - - { CTL_ULONG, KERN_SHMMAX, "shmmax" }, - { CTL_INT, KERN_MSGMAX, "msgmax" }, - { CTL_INT, KERN_MSGMNB, "msgmnb" }, - /* KERN_MSGPOOL not used*/ - { CTL_INT, KERN_SYSRQ, "sysrq" }, - { CTL_INT, KERN_MAX_THREADS, "threads-max" }, - { CTL_DIR, KERN_RANDOM, "random", bin_random_table }, - { CTL_ULONG, KERN_SHMALL, "shmall" }, - { CTL_INT, KERN_MSGMNI, "msgmni" }, - { CTL_INT, KERN_SEM, "sem" }, - { CTL_INT, KERN_SPARC_STOP_A, "stop-a" }, - { CTL_INT, KERN_SHMMNI, "shmmni" }, - - { CTL_INT, KERN_OVERFLOWUID, "overflowuid" }, - { CTL_INT, KERN_OVERFLOWGID, "overflowgid" }, - - { CTL_STR, KERN_HOTPLUG, "hotplug", }, - { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" }, - - { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" }, - { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" }, - /* KERN_TAINTED "tainted" no longer used */ - { CTL_INT, KERN_CADPID, "cad_pid" }, - { CTL_INT, KERN_PIDMAX, "pid_max" }, - { CTL_STR, KERN_CORE_PATTERN, "core_pattern" }, - { CTL_INT, KERN_PANIC_ON_OOPS, 
"panic_on_oops" }, - { CTL_INT, KERN_HPPA_PWRSW, "soft-power" }, - { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" }, - - { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" }, - { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" }, - - { CTL_DIR, KERN_PTY, "pty", bin_pty_table }, - { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" }, - { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" }, - /* KERN_HZ_TIMER "hz_timer" no longer used */ - { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, - { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" }, - { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" }, - - { CTL_INT, KERN_SPIN_RETRY, "spin_retry" }, - /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */ - { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, - { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, - { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, - { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, - { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, - { CTL_ULONG, KERN_PANIC_PRINT, "panic_print" }, - {} -}; - -static const struct bin_table bin_vm_table[] = { - { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, - { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" }, - { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, - { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, - /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ - /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ - /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */ - { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, - /* VM_PAGEBUF unused */ - /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ - { CTL_INT, VM_SWAPPINESS, "swappiness" }, - { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" }, - { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" }, - { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" }, - { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" }, - { CTL_INT, VM_BLOCK_DUMP, "block_dump" }, - { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" }, - { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" }, - { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" }, - /* VM_SWAP_TOKEN_TIMEOUT unused */ - { CTL_INT, VM_DROP_PAGECACHE, "drop_caches" }, - { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" }, - { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" }, - { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" }, - { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" }, - { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" }, - { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" }, - - {} -}; - -static const struct bin_table bin_net_core_table[] = { - { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" }, - { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" }, - { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" }, - { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" }, - /* NET_CORE_DESTROY_DELAY unused */ - { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" }, - /* NET_CORE_FASTROUTE unused */ - { CTL_INT, NET_CORE_MSG_COST, "message_cost" }, - { CTL_INT, NET_CORE_MSG_BURST, "message_burst" }, - { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" }, - /* NET_CORE_HOT_LIST_LENGTH unused */ - /* NET_CORE_DIVERT_VERSION unused */ - /* NET_CORE_NO_CONG_THRESH unused */ - /* NET_CORE_NO_CONG unused */ - /* NET_CORE_LO_CONG unused */ - /* NET_CORE_MOD_CONG unused */ - { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" }, - { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" }, - { CTL_INT, NET_CORE_BUDGET, "netdev_budget" }, - { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, - { CTL_INT, 
NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, - { CTL_INT, NET_CORE_WARNINGS, "warnings" }, - {}, -}; - -static const struct bin_table bin_net_unix_table[] = { - /* NET_UNIX_DESTROY_DELAY unused */ - /* NET_UNIX_DELETE_DELAY unused */ - { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, - {} -}; - -static const struct bin_table bin_net_ipv4_route_table[] = { - { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" }, - /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */ - /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */ - { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" }, - { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" }, - { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, - /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */ - { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, - { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, - { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, - { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" }, - { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" }, - { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, - { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - {} -}; - -static const struct bin_table bin_net_ipv4_conf_vars_table[] = { - { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" }, - { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, - - { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" }, - { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" }, - { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" }, - { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" }, - { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" }, - { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" }, - { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" }, - { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" }, - { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" }, - { CTL_INT, NET_IPV4_CONF_TAG, "tag" }, - { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" }, - { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" }, - { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, - { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, - { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" }, - - { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" }, - { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" }, - { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" }, - { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, - {} -}; - -static const struct bin_table bin_net_ipv4_conf_table[] = { - { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table }, - { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table }, - { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table }, - {} -}; - -static const struct bin_table bin_net_neigh_vars_table[] = { - { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, - { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, - { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" }, - /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */ - { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" }, - { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" }, - { CTL_INT, 
NET_NEIGH_GC_STALE_TIME, "gc_stale_time" }, - { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" }, - { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" }, - /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */ - /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */ - /* NET_NEIGH_LOCKTIME "locktime" no longer used */ - { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" }, - { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" }, - { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" }, - { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" }, - { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" }, - { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" }, - {} -}; - -static const struct bin_table bin_net_neigh_table[] = { - { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table }, - { CTL_DIR, 0, NULL, bin_net_neigh_vars_table }, - {} -}; - -static const struct bin_table bin_net_ipv4_netfilter_table[] = { - { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, - - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */ - - /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */ - /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */ - /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */ - /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */ - - { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" }, - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */ - { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" }, - - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ - - { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" }, - { CTL_INT, 
NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" }, - {} -}; - -static const struct bin_table bin_net_ipv4_table[] = { - {CTL_INT, NET_IPV4_FORWARD, "ip_forward" }, - - { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table }, - { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table }, - { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table }, - /* NET_IPV4_FIB_HASH unused */ - { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table }, - - { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, - { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, - { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" }, - { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" }, - { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" }, - /* NET_IPV4_AUTOCONFIG unused */ - { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" }, - { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" }, - { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" }, - { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" }, - { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" }, - { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" }, - { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" }, - { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" }, - { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" }, - { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" }, - { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, - { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, - { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, - { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" }, - { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, - { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, - { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" }, - { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" }, - { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" }, - { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" }, - { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" }, - { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" }, - { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" }, - { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" }, - { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" }, - { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" }, - { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" }, - { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" }, - { CTL_INT, NET_TCP_FACK, "tcp_fack" }, - { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" }, - { CTL_INT, NET_TCP_ECN, "tcp_ecn" }, - { CTL_INT, NET_TCP_DSACK, "tcp_dsack" }, - { CTL_INT, NET_TCP_MEM, "tcp_mem" }, - { CTL_INT, NET_TCP_WMEM, "tcp_wmem" }, - { CTL_INT, NET_TCP_RMEM, "tcp_rmem" }, - { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" }, - { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" }, - { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" }, - { CTL_INT, NET_TCP_FRTO, "tcp_frto" }, - { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" }, - { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" }, - { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" }, - { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, - { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, - { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, - { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, - { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, - { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { CTL_INT, 
NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, - { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, - { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, - { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" }, - { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" }, - /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */ - { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" }, - { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" }, - - { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" }, - { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" }, - { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" }, - { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" }, - { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" }, - { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" }, - - { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" }, - { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" }, - { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" }, - - { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, - /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */ - - { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, - - /* NET_TCP_DEFAULT_WIN_SCALE unused */ - /* NET_TCP_BIC_BETA unused */ - /* NET_IPV4_TCP_MAX_KA_PROBES unused */ - /* NET_IPV4_IP_MASQ_DEBUG unused */ - /* NET_TCP_SYN_TAILDROP unused */ - /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */ - /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */ - /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */ - /* NET_IPV4_ICMP_PARAMPROB_RATE unused */ - /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */ - /* NET_IPV4_ALWAYS_DEFRAG unused */ - {} -}; - -static const struct bin_table bin_net_ipx_table[] = { - { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, - /* NET_IPX_FORWARDING unused */ - {} -}; - -static const struct bin_table bin_net_atalk_table[] = { - { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, - { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, - { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, - { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" }, - {}, -}; - -static const struct bin_table bin_net_netrom_table[] = { - { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, - { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, - { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, - { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" }, - { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" }, - { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" }, - { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" }, - { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" }, - { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" }, - { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" }, - { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" }, - { CTL_INT, NET_NETROM_RESET, "reset" }, - {} -}; - -static const struct bin_table bin_net_ax25_param_table[] = { - { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, - { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, - { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" }, 
- { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" }, - { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" }, - { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" }, - { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" }, - { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" }, - { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" }, - { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" }, - { CTL_INT, NET_AX25_N2, "maximum_retry_count" }, - { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" }, - { CTL_INT, NET_AX25_PROTOCOL, "protocol" }, - { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" }, - {} -}; - -static const struct bin_table bin_net_ax25_table[] = { - { CTL_DIR, 0, NULL, bin_net_ax25_param_table }, - {} -}; - -static const struct bin_table bin_net_rose_table[] = { - { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" }, - { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" }, - { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" }, - { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" }, - { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" }, - { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" }, - {} -}; - -static const struct bin_table bin_net_ipv6_conf_var_table[] = { - { CTL_INT, NET_IPV6_FORWARDING, "forwarding" }, - { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" }, - { CTL_INT, NET_IPV6_MTU, "mtu" }, - { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" }, - { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" }, - { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" }, - { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" }, - { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" }, - { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" }, - { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" }, - { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" }, - { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" }, - { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" }, - { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" }, - { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" }, - { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" }, - { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" }, - { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, - { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, - { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" }, - {} -}; - -static const struct bin_table bin_net_ipv6_conf_table[] = { - { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table }, - { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table }, - { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table }, - {} -}; - -static const struct bin_table bin_net_ipv6_route_table[] = { - /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */ - { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, - { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, - 
{ CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
-	{ CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
-	{ CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
-	{ CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
-	{ CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
-	{ CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
-	{ CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
-	{}
-};
-
-static const struct bin_table bin_net_ipv6_icmp_table[] = {
-	{ CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
-	{}
-};
-
-static const struct bin_table bin_net_ipv6_table[] = {
-	{ CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table },
-	{ CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table },
-	{ CTL_DIR, NET_IPV6_ROUTE, "route", bin_net_ipv6_route_table },
-	{ CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table },
-	{ CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" },
-	{ CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
-	{ CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
-	{ CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
-	{ CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
-	{ CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
-	{ CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
-	{}
-};
-
-static const struct bin_table bin_net_x25_table[] = {
-	{ CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
-	{ CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
-	{ CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
-	{ CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
-	{ CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
-	{ CTL_INT, NET_X25_FORWARD, "x25_forward" },
-	{}
-};
-
-static const struct bin_table bin_net_tr_table[] = {
-	{ CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" },
-	{}
-};
-
-
-static const struct bin_table bin_net_decnet_conf_vars[] = {
-	{ CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
-	{ CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" },
-	{ CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" },
-	{ CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" },
-	{}
-};
-
-static const struct bin_table bin_net_decnet_conf[] = {
-	{ CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars },
-	{ CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars },
-	{ CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars },
-	{ CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars },
-	{ CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars },
-	{ CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars },
-	{ CTL_DIR, 0, NULL, bin_net_decnet_conf_vars },
-	{}
-};
-
-static const struct bin_table bin_net_decnet_table[] = {
-	{ CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf },
-	{ CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" },
-	{ CTL_STR, NET_DECNET_NODE_NAME, "node_name" },
-	{ CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" },
-	{ CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" },
-	{ CTL_INT, NET_DECNET_DN_COUNT, "dn_count" },
-	{ CTL_INT, NET_DECNET_DI_COUNT, "di_count" },
-	{ CTL_INT, NET_DECNET_DR_COUNT, "dr_count" },
-	{ CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
-	{ CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
-	{ CTL_INT, NET_DECNET_MEM, "decnet_mem" },
-	{ CTL_INT, NET_DECNET_RMEM, "decnet_rmem" },
-	{ CTL_INT, NET_DECNET_WMEM, "decnet_wmem" },
-	{ CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" },
-	{}
-};
-
-static const struct bin_table bin_net_sctp_table[] = {
-	{ CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" },
-	{ CTL_INT, NET_SCTP_RTO_MIN, "rto_min" },
-	{ CTL_INT, NET_SCTP_RTO_MAX, "rto_max" },
-	{ CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
-	{ CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
-	{ CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
-	{ CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
-	{ CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
-	{ CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
-	{ CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" },
-	{ CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
-	{ CTL_INT, NET_SCTP_MAX_BURST, "max_burst" },
-	{ CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" },
-	{ CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
-	{ CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
-	{ CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
-	{ CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
-	{}
-};
-
-static const struct bin_table bin_net_llc_llc2_timeout_table[] = {
-	{ CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" },
-	{ CTL_INT, NET_LLC2_P_TIMEOUT, "p" },
-	{ CTL_INT, NET_LLC2_REJ_TIMEOUT, "rej" },
-	{ CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" },
-	{}
-};
-
-static const struct bin_table bin_net_llc_station_table[] = {
-	{ CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
-	{}
-};
-
-static const struct bin_table bin_net_llc_llc2_table[] = {
-	{ CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table },
-	{}
-};
-
-static const struct bin_table bin_net_llc_table[] = {
-	{ CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table },
-	{ CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table },
-	{}
-};
-
-static const struct bin_table bin_net_netfilter_table[] = {
-	{ CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */
-	/* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */
-	/* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */
-	/* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */
-	/* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */
-	{ CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
-	{ CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */
-	{ CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
-	{ CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
-	{ CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
-	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */
-	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */
-	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */
-	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */
-	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */
-	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */
-	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
-	{ CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
-	/* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */
-	/* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */
-	{ CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
-	{ CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
-	{ CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
-
-	{}
-};
-
-static const struct bin_table bin_net_table[] = {
-	{ CTL_DIR, NET_CORE, "core", bin_net_core_table },
-	/* NET_ETHER not used */
-	/* NET_802 not used */
-	{ CTL_DIR, NET_UNIX, "unix", bin_net_unix_table },
-	{ CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table },
-	{ CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table },
-	{ CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table },
-	{ CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table },
-	{ CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table },
-	/* NET_BRIDGE "bridge" no longer used */
-	{ CTL_DIR, NET_ROSE, "rose", bin_net_rose_table },
-	{ CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table },
-	{ CTL_DIR, NET_X25, "x25", bin_net_x25_table },
-	{ CTL_DIR, NET_TR, "token-ring", bin_net_tr_table },
-	{ CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table },
-	/* NET_ECONET not used */
-	{ CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table },
-	{ CTL_DIR, NET_LLC, "llc", bin_net_llc_table },
-	{ CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table },
-	/* NET_DCCP "dccp" no longer used */
-	/* NET_IRDA "irda" no longer used */
-	{ CTL_INT, 2089, "nf_conntrack_max" },
-	{}
-};
-
-static const struct bin_table bin_fs_quota_table[] = {
-	{ CTL_INT, FS_DQ_LOOKUPS, "lookups" },
-	{ CTL_INT, FS_DQ_DROPS, "drops" },
-	{ CTL_INT, FS_DQ_READS, "reads" },
-	{ CTL_INT, FS_DQ_WRITES, "writes" },
-	{ CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" },
-	{ CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" },
-	{ CTL_INT, FS_DQ_FREE, "free_dquots" },
-	{ CTL_INT, FS_DQ_SYNCS, "syncs" },
-	{ CTL_INT, FS_DQ_WARNINGS, "warnings" },
-	{}
-};
-
-static const struct bin_table bin_fs_xfs_table[] = {
-	{ CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" },
-	{ CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" },
-	{ CTL_INT, XFS_PANIC_MASK, "panic_mask" },
-
-	{ CTL_INT, XFS_ERRLEVEL, "error_level" },
-	{ CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
-	{ CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" },
-	{ CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" },
-	{ CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" },
-	{ CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" },
-	{ CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" },
-	{ CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
-	{ CTL_INT, XFS_ROTORSTEP, "rotorstep" },
-	{ CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" },
-	{ CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" },
-	{ CTL_INT, XFS_STATS_CLEAR, "stats_clear" },
-	{}
-};
-
-static const struct bin_table bin_fs_ocfs2_nm_table[] = {
-	{ CTL_STR, 1, "hb_ctl_path" },
-	{}
-};
-
-static const struct bin_table bin_fs_ocfs2_table[] = {
-	{ CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table },
-	{}
-};
-
-static const struct bin_table bin_inotify_table[] = {
-	{ CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
-	{ CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
-	{ CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
-	{}
-};
-
-static const struct bin_table bin_fs_table[] = {
-	{ CTL_INT, FS_NRINODE, "inode-nr" },
-	{ CTL_INT, FS_STATINODE, "inode-state" },
-	/* FS_MAXINODE unused */
-	/* FS_NRDQUOT unused */
-	/* FS_MAXDQUOT unused */
-	/* FS_NRFILE "file-nr" no longer used */
-	{ CTL_INT, FS_MAXFILE, "file-max" },
-	{ CTL_INT, FS_DENTRY, "dentry-state" },
-	/* FS_NRSUPER unused */
-	/* FS_MAXUPSER unused */
-	{ CTL_INT, FS_OVERFLOWUID, "overflowuid" },
-	{ CTL_INT, FS_OVERFLOWGID, "overflowgid" },
-	{ CTL_INT, FS_LEASES, "leases-enable" },
-	{ CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" },
-	{ CTL_INT, FS_LEASE_TIME, "lease-break-time" },
-	{ CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table },
-	{ CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table },
-	{ CTL_ULONG, FS_AIO_NR, "aio-nr" },
-	{ CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" },
-	{ CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table },
-	{ CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table },
-	{ CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" },
-	{}
-};
-
-static const struct bin_table bin_ipmi_table[] = {
-	{ CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
-	{}
-};
-
-static const struct bin_table bin_mac_hid_files[] = {
-	/* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
-	/* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
-	{ CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
-	{ CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
-	{ CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
-	/* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
-	{}
-};
-
-static const struct bin_table bin_raid_table[] = {
-	{ CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
-	{ CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
-	{}
-};
-
-static const struct bin_table bin_scsi_table[] = {
-	{ CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" },
-	{}
-};
-
-static const struct bin_table bin_dev_table[] = {
-	/* DEV_CDROM "cdrom" no longer used */
-	/* DEV_HWMON unused */
-	/* DEV_PARPORT "parport" no longer used */
-	{ CTL_DIR, DEV_RAID, "raid", bin_raid_table },
-	{ CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files },
-	{ CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table },
-	{ CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table },
-	{}
-};
-
-static const struct bin_table bin_bus_isa_table[] = {
-	{ CTL_INT, BUS_ISA_MEM_BASE, "membase" },
-	{ CTL_INT, BUS_ISA_PORT_BASE, "portbase" },
-	{ CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" },
-	{}
-};
-
-static const struct bin_table bin_bus_table[] = {
-	{ CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table },
-	{}
-};
-
-
-static const struct bin_table bin_s390dbf_table[] = {
-	{ CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
-	{ CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
-	{}
-};
-
-static const struct bin_table bin_sunrpc_table[] = {
-	/* CTL_RPCDEBUG "rpc_debug" no longer used */
-	/* CTL_NFSDEBUG "nfs_debug" no longer used */
-	/* CTL_NFSDDEBUG "nfsd_debug" no longer used */
-	/* CTL_NLMDEBUG "nlm_debug" no longer used */
-
-	{ CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
-	{ CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
-	{ CTL_INT, CTL_MIN_RESVPORT, "min_resvport" },
-	{ CTL_INT, CTL_MAX_RESVPORT, "max_resvport" },
-	{}
-};
-
-static const struct bin_table bin_pm_table[] = {
-	/* frv specific */
-	/* 1 == CTL_PM_SUSPEND "suspend" no longer used" */
-	{ CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" },
-	{ CTL_INT, 3 /* CTL_PM_P0 */, "p0" },
-	{ CTL_INT, 4 /* CTL_PM_CM */, "cm" },
-	{}
-};
-
-static const struct bin_table bin_root_table[] = {
-	{ CTL_DIR, CTL_KERN, "kernel", bin_kern_table },
-	{ CTL_DIR, CTL_VM, "vm", bin_vm_table },
-	{ CTL_DIR, CTL_NET, "net", bin_net_table },
-	/* CTL_PROC not used */
-	{ CTL_DIR, CTL_FS, "fs", bin_fs_table },
-	/* CTL_DEBUG "debug" no longer used */
-	{ CTL_DIR, CTL_DEV, "dev", bin_dev_table },
-	{ CTL_DIR, CTL_BUS, "bus", bin_bus_table },
-	{ CTL_DIR, CTL_ABI, "abi" },
-	/* CTL_CPU not used */
-	/* CTL_ARLAN "arlan" no longer used */
-	{ CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table },
-	{ CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table },
-	{ CTL_DIR, CTL_PM, "pm", bin_pm_table },
-	{}
-};
-
-static ssize_t bin_dir(struct file *file,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	return -ENOTDIR;
-}
-
-
-static ssize_t bin_string(struct file *file,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	ssize_t result, copied = 0;
-
-	if (oldval && oldlen) {
-		char __user *lastp;
-		loff_t pos = 0;
-		int ch;
-
-		result = vfs_read(file, oldval, oldlen, &pos);
-		if (result < 0)
-			goto out;
-
-		copied = result;
-		lastp = oldval + copied - 1;
-
-		result = -EFAULT;
-		if (get_user(ch, lastp))
-			goto out;
-
-		/* Trim off the trailing newline */
-		if (ch == '\n') {
-			result = -EFAULT;
-			if (put_user('\0', lastp))
-				goto out;
-			copied -= 1;
-		}
-	}
-
-	if (newval && newlen) {
-		loff_t pos = 0;
-
-		result = vfs_write(file, newval, newlen, &pos);
-		if (result < 0)
-			goto out;
-	}
-
-	result = copied;
-out:
-	return result;
-}
-
-static ssize_t bin_intvec(struct file *file,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	ssize_t copied = 0;
-	char *buffer;
-	ssize_t result;
-
-	result = -ENOMEM;
-	buffer = kmalloc(BUFSZ, GFP_KERNEL);
-	if (!buffer)
-		goto out;
-
-	if (oldval && oldlen) {
-		unsigned __user *vec = oldval;
-		size_t length = oldlen / sizeof(*vec);
-		char *str, *end;
-		int i;
-		loff_t pos = 0;
-
-		result = kernel_read(file, buffer, BUFSZ - 1, &pos);
-		if (result < 0)
-			goto out_kfree;
-
-		str = buffer;
-		end = str + result;
-		*end++ = '\0';
-		for (i = 0; i < length; i++) {
-			unsigned long value;
-
-			value = simple_strtoul(str, &str, 10);
-			while (isspace(*str))
-				str++;
-
-			result = -EFAULT;
-			if (put_user(value, vec + i))
-				goto out_kfree;
-
-			copied += sizeof(*vec);
-			if (!isdigit(*str))
-				break;
-		}
-	}
-
-	if (newval && newlen) {
-		unsigned __user *vec = newval;
-		size_t length = newlen / sizeof(*vec);
-		char *str, *end;
-		int i;
-		loff_t pos = 0;
-
-		str = buffer;
-		end = str + BUFSZ;
-		for (i = 0; i < length; i++) {
-			unsigned long value;
-
-			result = -EFAULT;
-			if (get_user(value, vec + i))
-				goto out_kfree;
-
-			str += scnprintf(str, end - str, "%lu\t", value);
-		}
-
-		result = kernel_write(file, buffer, str - buffer, &pos);
-		if (result < 0)
-			goto out_kfree;
-	}
-	result = copied;
-out_kfree:
-	kfree(buffer);
-out:
-	return result;
-}
-
-static ssize_t bin_ulongvec(struct file *file,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	ssize_t copied = 0;
-	char *buffer;
-	ssize_t result;
-
-	result = -ENOMEM;
-	buffer = kmalloc(BUFSZ, GFP_KERNEL);
-	if (!buffer)
-		goto out;
-
-	if (oldval && oldlen) {
-		unsigned long __user *vec = oldval;
-		size_t length = oldlen / sizeof(*vec);
-		char *str, *end;
-		int i;
-		loff_t pos = 0;
-
-		result = kernel_read(file, buffer, BUFSZ - 1, &pos);
-		if (result < 0)
-			goto out_kfree;
-
-		str = buffer;
-		end = str + result;
-		*end++ = '\0';
-		for (i = 0; i < length; i++) {
-			unsigned long value;
-
-			value = simple_strtoul(str, &str, 10);
-			while (isspace(*str))
-				str++;
-
-			result = -EFAULT;
-			if (put_user(value, vec + i))
-				goto out_kfree;
-
-			copied += sizeof(*vec);
-			if (!isdigit(*str))
-				break;
-		}
-	}
-
-	if (newval && newlen) {
-		unsigned long __user *vec = newval;
-		size_t length = newlen / sizeof(*vec);
-		char *str, *end;
-		int i;
-		loff_t pos = 0;
-
-		str = buffer;
-		end = str + BUFSZ;
-		for (i = 0; i < length; i++) {
-			unsigned long value;
-
-			result = -EFAULT;
-			if (get_user(value, vec + i))
-				goto out_kfree;
-
-			str += scnprintf(str, end - str, "%lu\t", value);
-		}
-
-		result = kernel_write(file, buffer, str - buffer, &pos);
-		if (result < 0)
-			goto out_kfree;
-	}
-	result = copied;
-out_kfree:
-	kfree(buffer);
-out:
-	return result;
-}
-
-static ssize_t bin_uuid(struct file *file,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	ssize_t result, copied = 0;
-
-	/* Only supports reads */
-	if (oldval && oldlen) {
-		char buf[UUID_STRING_LEN + 1];
-		uuid_t uuid;
-		loff_t pos = 0;
-
-		result = kernel_read(file, buf, sizeof(buf) - 1, &pos);
-		if (result < 0)
-			goto out;
-
-		buf[result] = '\0';
-
-		result = -EIO;
-		if (uuid_parse(buf, &uuid))
-			goto out;
-
-		if (oldlen > 16)
-			oldlen = 16;
-
-		result = -EFAULT;
-		if (copy_to_user(oldval, &uuid, oldlen))
-			goto out;
-
-		copied = oldlen;
-	}
-	result = copied;
-out:
-	return result;
-}
-
-static ssize_t bin_dn_node_address(struct file *file,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	ssize_t result, copied = 0;
-
-	if (oldval && oldlen) {
-		char buf[15], *nodep;
-		unsigned long area, node;
-		__le16 dnaddr;
-		loff_t pos = 0;
-
-		result = kernel_read(file, buf, sizeof(buf) - 1, &pos);
-		if (result < 0)
-			goto out;
-
-		buf[result] = '\0';
-
-		/* Convert the decnet address to binary */
-		result = -EIO;
-		nodep = strchr(buf, '.');
-		if (!nodep)
-			goto out;
-		++nodep;
-
-		area = simple_strtoul(buf, NULL, 10);
-		node = simple_strtoul(nodep, NULL, 10);
-
-		result = -EIO;
-		if ((area > 63)||(node > 1023))
-			goto out;
-
-		dnaddr = cpu_to_le16((area << 10) | node);
-
-		result = -EFAULT;
-		if (put_user(dnaddr, (__le16 __user *)oldval))
-			goto out;
-
-		copied = sizeof(dnaddr);
-	}
-
-	if (newval && newlen) {
-		__le16 dnaddr;
-		char buf[15];
-		int len;
-		loff_t pos = 0;
-
-		result = -EINVAL;
-		if (newlen != sizeof(dnaddr))
-			goto out;
-
-		result = -EFAULT;
-		if (get_user(dnaddr, (__le16 __user *)newval))
-			goto out;
-
-		len = scnprintf(buf, sizeof(buf), "%hu.%hu",
-				le16_to_cpu(dnaddr) >> 10,
-				le16_to_cpu(dnaddr) & 0x3ff);
-
-		result = kernel_write(file, buf, len, &pos);
-		if (result < 0)
-			goto out;
-	}
-
-	result = copied;
-out:
-	return result;
-}
-
-static const struct bin_table *get_sysctl(const int *name, int nlen, char *path)
-{
-	const struct bin_table *table = &bin_root_table[0];
-	int ctl_name;
-
-	/* The binary sysctl tables have a small maximum depth so
-	 * there is no danger of overflowing our path as it PATH_MAX
-	 * bytes long.
-	 */
-	memcpy(path, "sys/", 4);
-	path += 4;
-
-repeat:
-	if (!nlen)
-		return ERR_PTR(-ENOTDIR);
-	ctl_name = *name;
-	name++;
-	nlen--;
-	for ( ; table->convert; table++) {
-		int len = 0;
-
-		/*
-		 * For a wild card entry map from ifindex to network
-		 * device name.
-		 */
-		if (!table->ctl_name) {
-#ifdef CONFIG_NET
-			struct net *net = current->nsproxy->net_ns;
-			struct net_device *dev;
-			dev = dev_get_by_index(net, ctl_name);
-			if (dev) {
-				len = strlen(dev->name);
-				memcpy(path, dev->name, len);
-				dev_put(dev);
-			}
-#endif
-		/* Use the well known sysctl number to proc name mapping */
-		} else if (ctl_name == table->ctl_name) {
-			len = strlen(table->procname);
-			memcpy(path, table->procname, len);
-		}
-		if (len) {
-			path += len;
-			if (table->child) {
-				*path++ = '/';
-				table = table->child;
-				goto repeat;
-			}
-			*path = '\0';
-			return table;
-		}
-	}
-	return ERR_PTR(-ENOTDIR);
-}
-
-static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep)
-{
-	char *tmp, *result;
-
-	result = ERR_PTR(-ENOMEM);
-	tmp = __getname();
-	if (tmp) {
-		const struct bin_table *table = get_sysctl(name, nlen, tmp);
-		result = tmp;
-		*tablep = table;
-		if (IS_ERR(table)) {
-			__putname(tmp);
-			result = ERR_CAST(table);
-		}
-	}
-	return result;
-}
-
-static ssize_t binary_sysctl(const int *name, int nlen,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	const struct bin_table *table = NULL;
-	struct vfsmount *mnt;
-	struct file *file;
-	ssize_t result;
-	char *pathname;
-	int flags;
-
-	pathname = sysctl_getname(name, nlen, &table);
-	result = PTR_ERR(pathname);
-	if (IS_ERR(pathname))
-		goto out;
-
-	/* How should the sysctl be accessed? */
-	if (oldval && oldlen && newval && newlen) {
-		flags = O_RDWR;
-	} else if (newval && newlen) {
-		flags = O_WRONLY;
-	} else if (oldval && oldlen) {
-		flags = O_RDONLY;
-	} else {
-		result = 0;
-		goto out_putname;
-	}
-
-	mnt = task_active_pid_ns(current)->proc_mnt;
-	file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0);
-	result = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out_putname;
-
-	result = table->convert(file, oldval, oldlen, newval, newlen);
-
-	fput(file);
-out_putname:
-	__putname(pathname);
-out:
-	return result;
-}
-
-
-#else /* CONFIG_SYSCTL_SYSCALL */
-
 static ssize_t binary_sysctl(const int *name, int nlen,
 	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
-#endif /* CONFIG_SYSCTL_SYSCALL */
-
-
 static void deprecated_sysctl_warning(const int *name, int nlen)
 {
 	int i;
--
cgit v1.2.3
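The compatibility layer deleted above existed only to service the legacy sysctl(2)
system call, which named sysctls by vectors of integers that get_sysctl() translated
into /proc/sys paths. For context, here is a minimal userspace sketch of what that
interface looked like; this is a hypothetical example, not from the patch, and it
assumes a pre-removal kernel on an architecture that still defines __NR__sysctl.
Once the code above is gone, the call simply fails with ENOSYS.

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/sysctl.h>

	int main(void)
	{
		/* Binary name vector: { CTL_KERN, KERN_OSTYPE } maps to
		 * the path "sys/kernel/ostype" built by get_sysctl(). */
		int name[] = { CTL_KERN, KERN_OSTYPE };
		char buf[64];
		size_t len = sizeof(buf);
		struct __sysctl_args args = {
			.name = name,
			.nlen = 2,
			.oldval = buf,
			.oldlenp = &len,
		};

		if (syscall(SYS__sysctl, &args) < 0) {
			perror("_sysctl");	/* ENOSYS after the removal */
			return 1;
		}
		printf("%.*s\n", (int)len, buf);
		return 0;
	}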
From ff68dac6d65cd1347dad5d780dd8c90f29dc1b0b Mon Sep 17 00:00:00 2001
From: Gaowei Pu
Date: Sat, 30 Nov 2019 17:51:03 -0800
Subject: mm/mmap.c: use IS_ERR_VALUE to check return value of get_unmapped_area
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

get_unmapped_area() returns an address or -errno on failure.  Historically
we have checked for failure with offset_in_page(), which is correct but
quite hard to read.  Newer code started using IS_ERR_VALUE, which is much
easier to read.  Convert the remaining users of offset_in_page() as well.

[mhocko@suse.com: rewrite changelog]
[mhocko@kernel.org: fix mremap.c and uprobes.c sites also]
Link: http://lkml.kernel.org/r/20191012102512.28051-1-pugaowei@gmail.com
Signed-off-by: Gaowei Pu
Reviewed-by: Andrew Morton
Acked-by: Michal Hocko
Cc: Vlastimil Babka
Cc: Wei Yang
Cc: Konstantin Khlebnikov
Cc: Kirill A. Shutemov
Cc: "Jérôme Glisse"
Cc: Mike Kravetz
Cc: Rik van Riel
Cc: Qian Cai
Cc: Shakeel Butt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/events/uprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c74761004ee5..ece7e13f6e4a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1457,7 +1457,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
 	/* Try to map as high as possible, this is only a hint. */
 	area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
 					PAGE_SIZE, 0, 0);
-	if (area->vaddr & ~PAGE_MASK) {
+	if (IS_ERR_VALUE(area->vaddr)) {
 		ret = area->vaddr;
 		goto fail;
 	}
--
cgit v1.2.3
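To make the readability argument concrete: an error return from
get_unmapped_area() is a -errno stored in an unsigned long, which is never
page-aligned, so both checks catch the same values. Here is a small,
self-contained userspace illustration (hypothetical, with simplified versions
of the kernel macros inlined and a 4K page size assumed):

	#include <stdio.h>

	#define MAX_ERRNO		4095UL
	#define IS_ERR_VALUE(x)		((unsigned long)(x) >= -MAX_ERRNO)
	#define offset_in_page(p)	((unsigned long)(p) & 4095UL)

	int main(void)
	{
		unsigned long good = 0x7f0000000000UL;	 /* page-aligned mapping */
		unsigned long bad = (unsigned long)-12;	 /* -ENOMEM from the kernel */

		/* Old style: error values are never page-aligned, so this works... */
		printf("offset_in_page: good=%lu bad=%lu\n",
		       offset_in_page(good), offset_in_page(bad));

		/* ...but the new style states the intent: is this a -errno? */
		printf("IS_ERR_VALUE:   good=%d bad=%d\n",
		       (int)IS_ERR_VALUE(good), (int)IS_ERR_VALUE(bad));
		return 0;
	}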
From eafb149ed73a8bb8359c0ce027b98acd4e95b070 Mon Sep 17 00:00:00 2001
From: Daniel Axtens
Date: Sat, 30 Nov 2019 17:54:57 -0800
Subject: fork: support VMAP_STACK with KASAN_VMALLOC

Supporting VMAP_STACK with KASAN_VMALLOC is straightforward:

 - clear the shadow region of vmapped stacks when swapping them in
 - tweak Kconfig to allow VMAP_STACK to be turned on with KASAN

Link: http://lkml.kernel.org/r/20191031093909.9228-4-dja@axtens.net
Signed-off-by: Daniel Axtens
Reviewed-by: Dmitry Vyukov
Reviewed-by: Andrey Ryabinin
Cc: Alexander Potapenko
Cc: Christophe Leroy
Cc: Mark Rutland
Cc: Vasily Gorbik
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 0f0bac8318dd..21c6c1e29b98 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -93,6 +93,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -223,6 +224,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 		if (!s)
 			continue;

+		/* Clear the KASAN shadow of the stack. */
+		kasan_unpoison_shadow(s->addr, THREAD_SIZE);
+
 		/* Clear stale pointers from reused stack. */
 		memset(s->addr, 0, THREAD_SIZE);
--
cgit v1.2.3
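Why the unpoison must come before the memset: with KASAN_VMALLOC, a vmapped
stack is backed by real shadow memory, so poison left behind by the previous
task survives in the per-cpu stack cache, and touching the stack again would
trip a KASAN report. A condensed, hypothetical sketch of the cached-stack
path after this patch (reuse_cached_stack() is an illustrative name, not a
function from the patch; kasan_unpoison_shadow() is a no-op without KASAN):

	#include <linux/kasan.h>
	#include <linux/string.h>
	#include <linux/thread_info.h>	/* THREAD_SIZE, arch-defined */
	#include <linux/vmalloc.h>

	/* Sketch: handing out a vmapped stack from the per-cpu cache. */
	static void *reuse_cached_stack(struct vm_struct *s)
	{
		/*
		 * Wipe any stale KASAN poison left by the stack's previous
		 * owner; without this, the memset below would itself be
		 * reported as a bad access to poisoned shadow.
		 */
		kasan_unpoison_shadow(s->addr, THREAD_SIZE);

		/* Clear stale pointers from reused stack. */
		memset(s->addr, 0, THREAD_SIZE);
		return s->addr;
	}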
From 204cb79ad42f015312a5bbd7012d09c93d9b46fb Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Sat, 30 Nov 2019 17:56:08 -0800
Subject: kernel: sysctl: make drop_caches write-only

Currently, the drop_caches proc file and sysctl read back the last value
written, suggesting this is somehow a stateful setting instead of a
one-time command.  Make it write-only, like e.g. compact_memory.

While we were mitigating a VM problem at scale in our fleet, there was
confusion about whether writing to this file would permanently switch the
kernel into a non-caching mode.  This influenced the decision making in a
tense situation, where tens of people were trying to fix tens of thousands
of affected machines: Do we need a rollback strategy?  What are the
performance implications of operating in a non-caching state for several
days?  It also caused confusion when the kernel team said we might need to
write the file several times to make sure it was effective ("But it
already reads back 3?").

Link: http://lkml.kernel.org/r/20191031221602.9375-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner
Acked-by: Chris Down
Acked-by: Vlastimil Babka
Acked-by: David Hildenbrand
Acked-by: Michal Hocko
Acked-by: Alexey Dobriyan
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sysctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b6f2f35d0bcf..70665934d53e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1466,7 +1466,7 @@ static struct ctl_table vm_table[] = {
 		.procname	= "drop_caches",
 		.data		= &sysctl_drop_caches,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0200,
 		.proc_handler	= drop_caches_sysctl_handler,
 		.extra1		= SYSCTL_ONE,
 		.extra2		= &four,
--
cgit v1.2.3
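The user-visible effect of changing the mode from 0644 to 0200 is that the
write still works as before, but a subsequent open for reading is refused, so
nothing ever "reads back 3" again. A hypothetical userspace demonstration of
that behavior change (the write requires root; the exact errno on the read is
an assumption based on the new 0200 mode bits):

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *path = "/proc/sys/vm/drop_caches";
		int fd;

		/* The one-time command: still permitted for root. */
		fd = open(path, O_WRONLY);
		if (fd >= 0) {
			if (write(fd, "3\n", 2) != 2)
				perror("write");
			close(fd);
		}

		/* Reading is no longer meaningful and now fails outright. */
		fd = open(path, O_RDONLY);
		if (fd < 0)
			printf("read back: %s\n", strerror(errno));
		else
			close(fd);
		return 0;
	}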