From 245d73698ed7abdc7e520dfa38048bb80ce89571 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 2 Oct 2019 16:41:58 -0700 Subject: audit: Report suspicious O_CREAT usage This renames the very specific audit_log_link_denied() to audit_log_path_denied() and adds the AUDIT_* type as an argument. This allows for the creation of the new AUDIT_ANOM_CREAT that can be used to report the fifo/regular file creation restrictions that were introduced in commit 30aba6656f61 ("namei: allow restricted O_CREAT of FIFOs and regular files"). Signed-off-by: Kees Cook Signed-off-by: Paul Moore --- kernel/audit.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index da8dc0db5bd3..d75485aa25ff 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2155,18 +2155,19 @@ void audit_log_task_info(struct audit_buffer *ab) EXPORT_SYMBOL(audit_log_task_info); /** - * audit_log_link_denied - report a link restriction denial - * @operation: specific link operation + * audit_log_path_denied - report a path restriction denial + * @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc) + * @operation: specific operation name */ -void audit_log_link_denied(const char *operation) +void audit_log_path_denied(int type, const char *operation) { struct audit_buffer *ab; if (!audit_enabled || audit_dummy_context()) return; - /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */ - ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_ANOM_LINK); + /* Generate log with subject, operation, outcome. */ + ab = audit_log_start(audit_context(), GFP_KERNEL, type); if (!ab) return; audit_log_format(ab, "op=%s", operation); -- cgit v1.2.3 From 0f8b5b6d56b5fa4085c06945ea3e1ee5941ecfeb Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:17 -0700 Subject: kgdb: Remove unused DCPU_SSTEP definition From doing a 'git log --patch kernel/debug', it looks as if DCPU_SSTEP has never been used. Presumably it used to be used back when kgdb was out of tree and nobody thought to delete the definition when the usage went away. Delete. Signed-off-by: Douglas Anderson Acked-by: Jason Wessel Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index b4a7c326d546..804b0fe5a0ba 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -33,7 +33,6 @@ struct kgdb_state { #define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ #define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ #define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ -#define DCPU_SSTEP 0x8 /* CPU is single stepping */ struct debuggerinfo_struct { void *debuggerinfo; -- cgit v1.2.3 From 54af3e39eed7d77f0923511f3c7f446e7d477635 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:18 -0700 Subject: kdb: Remove unused "argcount" param from kdb_bt1(); make btaprompt bool The kdb_bt1() had a mysterious "argcount" parameter passed in (always the number 5, by the way) and never used. Presumably this is just old cruft. Remove it. While at it, upgrade the btaprompt parameter to a full fledged bool instead of an int. 
Signed-off-by: Douglas Anderson Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_bt.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 7e2379aa0a1e..120fc686c919 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -78,8 +78,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) */ static int -kdb_bt1(struct task_struct *p, unsigned long mask, - int argcount, int btaprompt) +kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) { char buffer[2]; if (kdb_getarea(buffer[0], (unsigned long)p) || @@ -106,7 +105,6 @@ int kdb_bt(int argc, const char **argv) { int diag; - int argcount = 5; int btaprompt = 1; int nextarg; unsigned long addr; @@ -125,7 +123,7 @@ kdb_bt(int argc, const char **argv) /* Run the active tasks first */ for_each_online_cpu(cpu) { p = kdb_curr_task(cpu); - if (kdb_bt1(p, mask, argcount, btaprompt)) + if (kdb_bt1(p, mask, btaprompt)) return 0; } /* Now the inactive tasks */ @@ -134,7 +132,7 @@ kdb_bt(int argc, const char **argv) return 0; if (task_curr(p)) continue; - if (kdb_bt1(p, mask, argcount, btaprompt)) + if (kdb_bt1(p, mask, btaprompt)) return 0; } kdb_while_each_thread(g, p); } else if (strcmp(argv[0], "btp") == 0) { @@ -148,7 +146,7 @@ kdb_bt(int argc, const char **argv) p = find_task_by_pid_ns(pid, &init_pid_ns); if (p) { kdb_set_current_task(p); - return kdb_bt1(p, ~0UL, argcount, 0); + return kdb_bt1(p, ~0UL, false); } kdb_printf("No process with pid == %ld found\n", pid); return 0; @@ -159,7 +157,7 @@ kdb_bt(int argc, const char **argv) if (diag) return diag; kdb_set_current_task((struct task_struct *)addr); - return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0); + return kdb_bt1((struct task_struct *)addr, ~0UL, false); } else if (strcmp(argv[0], "btc") == 0) { unsigned long cpu = ~0; struct task_struct *save_current_task = kdb_current_task; @@ -211,7 +209,7 @@ kdb_bt(int argc, const char **argv) kdb_show_stack(kdb_current_task, (void *)addr); return 0; } else { - return kdb_bt1(kdb_current_task, ~0UL, argcount, 0); + return kdb_bt1(kdb_current_task, ~0UL, false); } } -- cgit v1.2.3 From 55a7e23f461fc2c321d7efcdeca1750085e9323f Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:19 -0700 Subject: kdb: Fix "btc <cpu>" crash if the CPU didn't round up I noticed that when I did "btc <cpu>" and the CPU I passed in hadn't rounded up, I'd crash. I was going to copy the same fix from commit 162bc7f5afd7 ("kdb: Don't back trace on a cpu that didn't round up") into the "not all the CPUs" case, but decided it'd be better to clean things up a little bit. This consolidates the two code paths. It is _slightly_ wasteful in that the checks for "cpu" being too small or being offline aren't really needed when we're iterating over all online CPUs, but that really shouldn't hurt. Better to have the same code path. While at it, eliminate at least one slightly ugly (and totally needless) recursive use of kdb_parse().
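For reference, a sketch of the kdb interaction this fixes (the CPU number is illustrative):

    [0]kdb> btc 3
    WARNING: no task for cpu 3

With the consolidated code path the command prints a warning such as the one above (the message text comes from the new kdb_bt_cpu() helper in the diff below) and returns cleanly, where previously it could crash.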
Signed-off-by: Douglas Anderson Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/kdb/kdb_bt.c | 61 ++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 120fc686c919..d9af139f9a31 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -101,6 +101,27 @@ kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) return 0; } +static void +kdb_bt_cpu(unsigned long cpu) +{ + struct task_struct *kdb_tsk; + + if (cpu >= num_possible_cpus() || !cpu_online(cpu)) { + kdb_printf("WARNING: no process for cpu %ld\n", cpu); + return; + } + + /* If a CPU failed to round up we could be here */ + kdb_tsk = KDB_TSK(cpu); + if (!kdb_tsk) { + kdb_printf("WARNING: no task for cpu %ld\n", cpu); + return; + } + + kdb_set_current_task(kdb_tsk); + kdb_bt1(kdb_tsk, ~0UL, false); +} + int kdb_bt(int argc, const char **argv) { @@ -161,7 +182,6 @@ kdb_bt(int argc, const char **argv) } else if (strcmp(argv[0], "btc") == 0) { unsigned long cpu = ~0; struct task_struct *save_current_task = kdb_current_task; - char buf[80]; if (argc > 1) return KDB_ARGCOUNT; if (argc == 1) { @@ -169,35 +189,22 @@ kdb_bt(int argc, const char **argv) if (diag) return diag; } - /* Recursive use of kdb_parse, do not use argv after - * this point */ - argv = NULL; if (cpu != ~0) { - if (cpu >= num_possible_cpus() || !cpu_online(cpu)) { - kdb_printf("no process for cpu %ld\n", cpu); - return 0; - } - sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu)); - kdb_parse(buf); - return 0; - } - kdb_printf("btc: cpu status: "); - kdb_parse("cpu\n"); - for_each_online_cpu(cpu) { - void *kdb_tsk = KDB_TSK(cpu); - - /* If a CPU failed to round up we could be here */ - if (!kdb_tsk) { - kdb_printf("WARNING: no task for cpu %ld\n", - cpu); - continue; + kdb_bt_cpu(cpu); + } else { + /* + * Recursive use of kdb_parse, do not use argv after + * this point. + */ + argv = NULL; + kdb_printf("btc: cpu status: "); + kdb_parse("cpu\n"); + for_each_online_cpu(cpu) { + kdb_bt_cpu(cpu); + touch_nmi_watchdog(); } - - sprintf(buf, "btt 0x%px\n", kdb_tsk); - kdb_parse(buf); - touch_nmi_watchdog(); + kdb_set_current_task(save_current_task); } - kdb_set_current_task(save_current_task); return 0; } else { if (argc) { -- cgit v1.2.3 From 2277b492582d5525244519f60da6f9daea5ef41a Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 25 Sep 2019 13:02:20 -0700 Subject: kdb: Fix stack crawling on 'running' CPUs that aren't the master In kdb when you do 'btc' (back trace on CPU) it doesn't necessarily give you the right info. Specifically on many architectures (including arm64, where I tested) you can't dump the stack of a "running" process that isn't the process running on the current CPU. This can be seen by this: echo SOFTLOCKUP > /sys/kernel/debug/provoke-crash/DIRECT # wait 2 seconds g Here's what I see now on rk3399-gru-kevin. I see the stack crawl for the CPU that handled the sysrq but everything else just shows me stuck in __switch_to() which is bogus: ====== [0]kdb> btc btc: cpu status: Currently on cpu 0 Available cpus: 0, 1-3(I), 4, 5(I) Stack traceback for pid 0 0xffffff801101a9c0 0 0 1 0 R 0xffffff801101b3b0 *swapper/0 Call trace: dump_backtrace+0x0/0x138 ... kgdb_compiled_brk_fn+0x34/0x44 ... 
sysrq_handle_dbg+0x34/0x5c Stack traceback for pid 0 0xffffffc0f175a040 0 0 1 1 I 0xffffffc0f175aa30 swapper/1 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f65616c0 Stack traceback for pid 0 0xffffffc0f175d040 0 0 1 2 I 0xffffffc0f175da30 swapper/2 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f65806c0 Stack traceback for pid 0 0xffffffc0f175b040 0 0 1 3 I 0xffffffc0f175ba30 swapper/3 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f659f6c0 Stack traceback for pid 1474 0xffffffc0dde8b040 1474 727 1 4 R 0xffffffc0dde8ba30 bash Call trace: __switch_to+0x1e4/0x240 __schedule+0x464/0x618 0xffffffc0dde8b040 Stack traceback for pid 0 0xffffffc0f17b0040 0 0 1 5 I 0xffffffc0f17b0a30 swapper/5 Call trace: __switch_to+0x1e4/0x240 0xffffffc0f65dd6c0 === The problem is that 'btc' eventually boils down to show_stack(task_struct, NULL); ...and show_stack() doesn't work for "running" CPUs because their registers haven't been stashed. On x86 things might work better (I haven't tested) because kdb has a special case for x86 in kdb_show_stack() where it passes the stack pointer to show_stack(). This wouldn't work on arm64 where the stack crawling function seems to need the "fp" and "pc", not the "sp", which is presumably why arm64's show_stack() function totally ignores the "sp" parameter. NOTE: we _can_ get a good stack dump for all the cpus if we manually switch each one to the kdb master and do a back trace. AKA: cpu 4 bt ...will give the expected trace. That's because arm64's dump_backtrace will now see that "tsk == current" and go through a different path. In this patch I fix the problems by catching a request to stack crawl a task that's running on a CPU and then I ask that CPU to do the stack crawl. NOTE: this will (presumably) change what stack crawls are printed for x86 machines. Now kdb functions will show up in the stack crawl. Presumably this is OK but if it's not we can go back and add a special case for x86 again. Signed-off-by: Douglas Anderson Acked-by: Will Deacon Signed-off-by: Daniel Thompson --- kernel/debug/debug_core.c | 34 ++++++++++++++++++++++++++++++++++ kernel/debug/debug_core.h | 2 ++ kernel/debug/kdb/kdb_bt.c | 19 +++++++------------ 3 files changed, 43 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index f76d6f77dd5e..70e86b4b4932 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -441,6 +441,37 @@ setundefined: return 0; } +#ifdef CONFIG_KGDB_KDB +void kdb_dump_stack_on_cpu(int cpu) +{ + if (cpu == raw_smp_processor_id()) { + dump_stack(); + return; + } + + if (!(kgdb_info[cpu].exception_state & DCPU_IS_SLAVE)) { + kdb_printf("ERROR: Task on cpu %d didn't stop in the debugger\n", + cpu); + return; + } + + /* + * In general, architectures don't support dumping the stack of a + * "running" process that's not the current one. From the point of + * view of the Linux kernel, processes that are looping in the kgdb + * slave loop are still "running". There's also no API (that actually + * works across all architectures) that can do a stack crawl based + * on registers passed as a parameter. + * + * Solve this conundrum by asking slave CPUs to do the backtrace + * themselves. + */ + kgdb_info[cpu].exception_state |= DCPU_WANT_BT; + while (kgdb_info[cpu].exception_state & DCPU_WANT_BT) + cpu_relax(); +} +#endif + /* * Return true if there is a valid kgdb I/O module.
Also if no * debugger is attached a message can be printed to the console about @@ -580,6 +611,9 @@ cpu_loop: atomic_xchg(&kgdb_active, cpu); break; } + } else if (kgdb_info[cpu].exception_state & DCPU_WANT_BT) { + dump_stack(); + kgdb_info[cpu].exception_state &= ~DCPU_WANT_BT; } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { if (!raw_spin_is_locked(&dbg_slave_lock)) goto return_normal; diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h index 804b0fe5a0ba..cd22b5f68831 100644 --- a/kernel/debug/debug_core.h +++ b/kernel/debug/debug_core.h @@ -33,6 +33,7 @@ struct kgdb_state { #define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ #define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ #define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ +#define DCPU_WANT_BT 0x8 /* Slave cpu should backtrace then clear flag */ struct debuggerinfo_struct { void *debuggerinfo; @@ -75,6 +76,7 @@ extern int kdb_stub(struct kgdb_state *ks); extern int kdb_parse(const char *cmdstr); extern int kdb_common_init_state(struct kgdb_state *ks); extern int kdb_common_deinit_state(void); +extern void kdb_dump_stack_on_cpu(int cpu); #else /* ! CONFIG_KGDB_KDB */ static inline int kdb_stub(struct kgdb_state *ks) { diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index d9af139f9a31..0e94efe07b72 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -22,20 +22,15 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { int old_lvl = console_loglevel; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; - kdb_set_current_task(p); - if (addr) { - show_stack((struct task_struct *)p, addr); - } else if (kdb_current_regs) { -#ifdef CONFIG_X86 - show_stack(p, &kdb_current_regs->sp); -#else - show_stack(p, NULL); -#endif - } else { - show_stack(p, NULL); - } + + if (!addr && kdb_task_has_cpu(p)) + kdb_dump_stack_on_cpu(kdb_process_cpu(p)); + else + show_stack(p, addr); + console_loglevel = old_lvl; kdb_trap_printk--; } -- cgit v1.2.3 From fb3c5386b382d4097476ce9647260fc89b34afdb Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 20 Sep 2019 10:30:05 +0200 Subject: seccomp: add SECCOMP_USER_NOTIF_FLAG_CONTINUE This allows the seccomp notifier to continue a syscall. A positive discussion about this feature was triggered by a post to the ksummit-discuss mailing list (cf. [3]) and took place during KSummit (cf. [1]) and again at the containers/checkpoint-restore micro-conference at Linux Plumbers. Recently we landed seccomp support for SECCOMP_RET_USER_NOTIF (cf. [4]) which enables a process (watchee) to retrieve an fd for its seccomp filter. This fd can then be handed to another (usually more privileged) process (watcher). The watcher will then be able to receive seccomp messages about the syscalls having been performed by the watchee. This feature is heavily used in some userspace workloads. For example, it is currently used to intercept mknod() syscalls in user namespaces aka in containers. The mknod() syscall can be easily filtered based on dev_t. This allows us to only intercept a very specific subset of mknod() syscalls. Furthermore, mknod() is not possible in user namespaces toto caelo and so intercepting and denying syscalls that are not in the whitelist by accident is not a big deal. The watchee won't notice a difference. In contrast to mknod(), a lot of other syscalls we intercept (e.g. setxattr()) cannot be easily filtered like mknod() because they have pointer arguments.
Additionally, some of them might actually succeed in user namespaces (e.g. setxattr() for all "user.*" xattrs). Since we currently cannot tell seccomp to continue from a user notifier we are stuck with performing all of the syscalls in lieu of the container. This is a huge security liability since it is extremely difficult to correctly assume all of the necessary privileges of the calling task such that the syscall can be successfully emulated without escaping other additional security restrictions (think missing CAP_MKNOD for mknod(), or MS_NODEV on a filesystem etc.). This can be solved by telling seccomp to resume the syscall. One thing that came up in the discussion was the problem that another thread could change the memory after userspace has decided to let the syscall continue, which is a well-known TOCTOU with seccomp which is present in other ways already. The discussion showed that this feature is already very useful for any syscall without pointer arguments. For any accidentally intercepted non-pointer syscall it is safe to continue. For syscalls with pointer arguments there is a race, but for any cautious userspace and the main use cases the race doesn't matter. The notifier is intended to be used in a scenario where a more privileged watcher supervises the syscalls of a less privileged watchee to allow it to get around kernel-enforced limitations by performing the syscall for it whenever deemed safe by the watcher. Hence, if a user tricks the watcher into allowing a syscall they will either get a deny based on kernel-enforced restrictions later or they will have changed the arguments in such a way that they manage to perform a syscall with arguments that they would've been allowed to do anyway. In general, it is good to point out again that the notifier fd was not intended to allow userspace to implement a security policy but rather to work around kernel security mechanisms in cases where the watcher knows that a given action is safe to perform.
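To make the new flag concrete, here is a minimal sketch of a watcher replying "continue" to one notification. The helper name is hypothetical; the ioctls and structures are the existing seccomp user-notification UAPI from <linux/seccomp.h>, and the flag itself is added to the UAPI header by this series (outside the 'kernel'-limited view shown here):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/seccomp.h>

	/* notify_fd is the filter fd handed over by the watchee */
	static int handle_one_notification(int notify_fd)
	{
		struct seccomp_notif req;
		struct seccomp_notif_resp resp;

		memset(&req, 0, sizeof(req));
		if (ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_RECV, &req) < 0)
			return -1;

		memset(&resp, 0, sizeof(resp));
		resp.id = req.id;
		/* Resume the syscall in the watchee. Note that error and
		 * val must stay 0 when the flag is set, matching the check
		 * added to seccomp_notify_send() in the diff below. */
		resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;

		return ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_SEND, &resp);
	}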
/* References */ [1]: https://linuxplumbersconf.org/event/4/contributions/560 [2]: https://linuxplumbersconf.org/event/4/contributions/477 [3]: https://lore.kernel.org/r/20190719093538.dhyopljyr5ns33qx@brauner.io [4]: commit 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Co-developed-by: Kees Cook Signed-off-by: Christian Brauner Reviewed-by: Tycho Andersen Cc: Andy Lutomirski Cc: Will Drewry CC: Tyler Hicks Link: https://lore.kernel.org/r/20190920083007.11475-2-christian.brauner@ubuntu.com Signed-off-by: Kees Cook --- kernel/seccomp.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index dba52a7db5e8..12d2227e5786 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -75,6 +75,7 @@ struct seccomp_knotif { /* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */ int error; long val; + u32 flags; /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */ struct completion ready; @@ -732,11 +733,12 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter) return filter->notif->next_id++; } -static void seccomp_do_user_notification(int this_syscall, - struct seccomp_filter *match, - const struct seccomp_data *sd) +static int seccomp_do_user_notification(int this_syscall, + struct seccomp_filter *match, + const struct seccomp_data *sd) { int err; + u32 flags = 0; long ret = 0; struct seccomp_knotif n = {}; @@ -764,6 +766,7 @@ static void seccomp_do_user_notification(int this_syscall, if (err == 0) { ret = n.val; err = n.error; + flags = n.flags; } /* @@ -780,8 +783,14 @@ static void seccomp_do_user_notification(int this_syscall, list_del(&n.list); out: mutex_unlock(&match->notify_lock); + + /* Userspace requests to continue the syscall. */ + if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) + return 0; + syscall_set_return_value(current, task_pt_regs(current), err, ret); + return -1; } static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, @@ -867,8 +876,10 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, return 0; case SECCOMP_RET_USER_NOTIF: - seccomp_do_user_notification(this_syscall, match, sd); - goto skip; + if (seccomp_do_user_notification(this_syscall, match, sd)) + goto skip; + + return 0; case SECCOMP_RET_LOG: seccomp_log(this_syscall, 0, action, true); @@ -1087,7 +1098,11 @@ static long seccomp_notify_send(struct seccomp_filter *filter, if (copy_from_user(&resp, buf, sizeof(resp))) return -EFAULT; - if (resp.flags) + if (resp.flags & ~SECCOMP_USER_NOTIF_FLAG_CONTINUE) + return -EINVAL; + + if ((resp.flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) && + (resp.error || resp.val)) return -EINVAL; ret = mutex_lock_interruptible(&filter->notify_lock); @@ -1116,6 +1131,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter, knotif->state = SECCOMP_NOTIFY_REPLIED; knotif->error = resp.error; knotif->val = resp.val; + knotif->flags = resp.flags; complete(&knotif->ready); out: mutex_unlock(&filter->notify_lock); -- cgit v1.2.3 From da6043fe85eb5ec621e34a92540735dcebbea134 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Wed, 25 Sep 2019 15:39:12 +0100 Subject: PM / hibernate: memory_bm_find_bit(): Tighten node optimisation When looking for a bit by number we make use of the cached result from the preceding lookup to speed up operation. Firstly we check if the requested pfn is within the cached zone and, if not, look up the new zone.
We then check if the offset for that pfn falls within the existing cached node. This happens regardless of whether the node is within the zone we are now scanning. With certain memory layouts it is possible for this to trigger falsely, creating a temporary alias for the pfn to a different bit. This leads the hibernation code to free memory which it had never allocated, with the expected fallout. Ensure the zone we are scanning matches the cached zone before considering the cached node. Deep thanks go to Andrea for many, many, many hours of hacking and testing that went into cornering this bug. Reported-by: Andrea Righi Tested-by: Andrea Righi Signed-off-by: Andy Whitcroft Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 83105874f255..26b9168321e7 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -734,8 +734,15 @@ zone_found: * We have found the zone. Now walk the radix tree to find the leaf node * for our PFN. */ + + /* + * If the zone we wish to scan is the current zone and the + * pfn falls into the current node then we do not need to walk + * the tree. + */ node = bm->cur.node; - if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) + if (zone == bm->cur.zone && + ((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) goto node_found; node = zone->rtree; -- cgit v1.2.3 From ce4dd4429b3c7e4506870796f3b8b06d707d2928 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 16 Oct 2019 15:13:41 +0100 Subject: Remove the nr_exclusive argument from __wake_up_sync_key() Remove the nr_exclusive argument from __wake_up_sync_key() and derived functions as everything seems to set it to 1. Note also that if it wasn't set to 1, it would clear WF_SYNC anyway. Signed-off-by: David Howells Acked-by: Peter Zijlstra (Intel) --- kernel/exit.c | 2 +- kernel/sched/wait.c | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a46a50d67002..a1ff25ef050e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1435,7 +1435,7 @@ static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode, void __wake_up_parent(struct task_struct *p, struct task_struct *parent) { __wake_up_sync_key(&parent->signal->wait_chldexit, - TASK_INTERRUPTIBLE, 1, p); + TASK_INTERRUPTIBLE, p); } static long do_wait(struct wait_opts *wo) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index c1e566a114ca..b4b52361dab7 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -169,7 +169,6 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @wq_head: the waitqueue * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: opaque value to be passed to wakeup targets * * The sync wakeup differs that the waker knows that it will schedule * @@ -183,26 +182,21 @@ EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); * accessing the task state.
*/ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, void *key) + void *key) { - int wake_flags = 1; /* XXX WF_SYNC */ - if (unlikely(!wq_head)) return; - if (unlikely(nr_exclusive != 1)) - wake_flags = 0; - - __wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key); + __wake_up_common_lock(wq_head, mode, 1, WF_SYNC, key); } EXPORT_SYMBOL_GPL(__wake_up_sync_key); /* * __wake_up_sync - see __wake_up_sync_key() */ -void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_exclusive) +void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode) { - __wake_up_sync_key(wq_head, mode, nr_exclusive, NULL); + __wake_up_sync_key(wq_head, mode, NULL); } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ -- cgit v1.2.3 From d07ce4e32a8d68062c58a3e635619313c52d0bf7 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Mon, 21 Oct 2019 11:10:56 +0100 Subject: kdb: Avoid array subscript warnings on non-SMP builds Recent versions of gcc (reported on gcc-7.4) issue array subscript warnings for builds where SMP is not enabled. kernel/debug/debug_core.c: In function 'kdb_dump_stack_on_cpu': kernel/debug/debug_core.c:452:17: warning: array subscript is outside array +bounds [-Warray-bounds] if (!(kgdb_info[cpu].exception_state & DCPU_IS_SLAVE)) { ~~~~~~~~~^~~~~ kernel/debug/debug_core.c:469:33: warning: array subscript is outside array +bounds [-Warray-bounds] kgdb_info[cpu].exception_state |= DCPU_WANT_BT; kernel/debug/debug_core.c:470:18: warning: array subscript is outside array +bounds [-Warray-bounds] while (kgdb_info[cpu].exception_state & DCPU_WANT_BT) There is no bug here but there is scope to improve the code generation for non-SMP systems (whilst also silencing the warning). Reported-by: kbuild test robot Fixes: 2277b492582d ("kdb: Fix stack crawling on 'running' CPUs that aren't the master") Signed-off-by: Daniel Thompson Link: https://lore.kernel.org/r/20191021101057.23861-1-daniel.thompson@linaro.org Reviewed-by: Douglas Anderson --- kernel/debug/debug_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 70e86b4b4932..2b7c9b67931d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -444,7 +444,7 @@ setundefined: #ifdef CONFIG_KGDB_KDB void kdb_dump_stack_on_cpu(int cpu) { - if (cpu == raw_smp_processor_id()) { + if (cpu == raw_smp_processor_id() || !IS_ENABLED(CONFIG_SMP)) { dump_stack(); return; } -- cgit v1.2.3 From c34c78dfc1fc68a1f5403f996de8ca62f298d7b2 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Wed, 23 Oct 2019 21:27:34 +0800 Subject: audit: remove redundant condition check in kauditd_thread() A warning is found by the code analysis tool: "the condition 'if(ac && rc < 0)' is redundant: ac" The @ac variable has been checked before. It can't be a null pointer here, so remove the redundant condition check.
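A simplified sketch of the surrounding control flow in kauditd_thread(), showing why @ac is known to be non-NULL at the two sites being changed (details of the real function are elided):

	ac = rcu_dereference(auditd_conn);
	if (!ac) {
		sk = NULL;
		goto main_queue;	/* the only path where ac is NULL */
	}
	/* ... */
	rc = kauditd_send_queue(sk, portid, &audit_hold_queue, ...);
	if (rc < 0) {			/* the "ac &&" test added nothing here */
		sk = NULL;
		auditd_reset(ac);
		goto main_queue;
	}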
Signed-off-by: Yunfeng Ye Signed-off-by: Paul Moore --- kernel/audit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d75485aa25ff..8e09f0f55b4b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -830,7 +830,7 @@ static int kauditd_thread(void *dummy) rc = kauditd_send_queue(sk, portid, &audit_hold_queue, UNICAST_RETRIES, NULL, kauditd_rehold_skb); - if (ac && rc < 0) { + if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; @@ -840,7 +840,7 @@ static int kauditd_thread(void *dummy) rc = kauditd_send_queue(sk, portid, &audit_retry_queue, UNICAST_RETRIES, NULL, kauditd_hold_skb); - if (ac && rc < 0) { + if (rc < 0) { sk = NULL; auditd_reset(ac); goto main_queue; -- cgit v1.2.3 From 53b63136e81220cb2f8b541c03a1df9199896821 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:24 +0100 Subject: kdb: Tidy up code to handle escape sequences kdb_read_get_key() has extremely complex break/continue control flow managed by state variables and is very hard to review or modify. In particular the way the escape sequence handling interacts with the general control flow is hard to follow. Separate out the escape key handling, without changing the control flow. This makes the main body of the code easier to review. Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-2-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 128 ++++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 3a5184eb6977..cfc054fd8097 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -49,6 +49,65 @@ static int kgdb_transition_check(char *buffer) return 0; } +/** + * kdb_handle_escape() - validity check on an accumulated escape sequence. + * @buf: Accumulated escape characters to be examined. Note that buf + * is not a string, it is an array of characters and need not be + * nil terminated. + * @sz: Number of accumulated escape characters. + * + * Return: -1 if the escape sequence is unwanted, 0 if it is incomplete, + * otherwise it returns a mapped key value to pass to the upper layers. 
+ */ +static int kdb_handle_escape(char *buf, size_t sz) +{ + char *lastkey = buf + sz - 1; + + switch (sz) { + case 1: + if (*lastkey == '\e') + return 0; + break; + + case 2: /* \e<something> */ + if (*lastkey == '[') + return 0; + break; + + case 3: + switch (*lastkey) { + case 'A': /* \e[A, up arrow */ + return 16; + case 'B': /* \e[B, down arrow */ + return 14; + case 'C': /* \e[C, right arrow */ + return 6; + case 'D': /* \e[D, left arrow */ + return 2; + case '1': /* \e[<1,3,4>], may be home, del, end */ + case '3': + case '4': + return 0; + } + break; + + case 4: + if (*lastkey == '~') { + switch (buf[2]) { + case '1': /* \e[1~, home */ + return 1; + case '3': /* \e[3~, del */ + return 4; + case '4': /* \e[4~, end */ + return 5; + } + } + break; + } + + return -1; +} + static int kdb_read_get_key(char *buffer, size_t bufsize) { #define ESCAPE_UDELAY 1000 @@ -102,68 +161,15 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) escape_delay = 2; continue; } - if (ped - escape_data == 1) { - /* \e */ - continue; - } else if (ped - escape_data == 2) { - /* \e<something> */ - if (key != '[') - escape_delay = 2; - continue; - } else if (ped - escape_data == 3) { - /* \e[<something> */ - int mapkey = 0; switch (key) { - case 'A': /* \e[A, up arrow */ - mapkey = 16; - break; - case 'B': /* \e[B, down arrow */ - mapkey = 14; - break; - case 'C': /* \e[C, right arrow */ - mapkey = 6; - break; - case 'D': /* \e[D, left arrow */ - mapkey = 2; - break; - case '1': /* dropthrough */ - case '3': /* dropthrough */ - /* \e[<1,3,4>], may be home, del, end */ - case '4': - mapkey = -1; - break; - } - if (mapkey != -1) { - if (mapkey > 0) { - escape_data[0] = mapkey; - escape_data[1] = '\0'; - } - escape_delay = 2; - } - continue; - } else if (ped - escape_data == 4) { - /* \e[<1,3,4> */ - int mapkey = 0; if (key == '~') { switch (escape_data[2]) { - case '1': /* \e[1~, home */ - mapkey = 1; - break; - case '3': /* \e[3~, del */ - mapkey = 4; - break; - case '4': /* \e[4~, end */ - mapkey = 5; - break; - } - } - if (mapkey > 0) { - escape_data[0] = mapkey; - escape_data[1] = '\0'; - } - escape_delay = 2; - continue; + + key = kdb_handle_escape(escape_data, ped - escape_data); + if (key > 0) { + escape_data[0] = key; + escape_data[1] = '\0'; } + if (key) + escape_delay = 2; + continue; } break; /* A key to process */ } -- cgit v1.2.3 From d04213af90935d8b247c1327c9ea142fc037165f Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:25 +0100 Subject: kdb: Simplify code to fetch characters from console Currently kdb_read_get_key() contains complex control flow that, on close inspection, turns out to be unnecessary. In particular: 1. It is impossible to enter the branch conditioned on (escape_delay == 1) except when the loop enters with (escape_delay == 2), allowing us to combine the branches. 2. Most of the code conditioned on (escape_delay == 2) simply modifies local data and then breaks out of the loop causing the function to return escape_data[0]. 3. Based on #2 there is not actually any need to ever explicitly set escape_delay to 2 because it is much simpler to directly return escape_data[0] instead. 4. escape_data[0] is, for all but one exit path, known to be '\e'. Simplify the code based on these observations.
There is a subtle (and harmless) change of behaviour resulting from this simplification: instead of letting the escape time out after ~1998 milliseconds we now time out after ~2000 milliseconds. Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-3-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index cfc054fd8097..a92ceca29637 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -124,25 +124,18 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) touch_nmi_watchdog(); f = &kdb_poll_funcs[0]; } - if (escape_delay == 2) { - *ped = '\0'; - ped = escape_data; - --escape_delay; - } - if (escape_delay == 1) { - key = *ped++; - if (!*ped) - --escape_delay; - break; - } + key = (*f)(); + if (key == -1) { if (escape_delay) { udelay(ESCAPE_UDELAY); - --escape_delay; + if (--escape_delay == 0) + return '\e'; } continue; } + if (bufsize <= 2) { if (key == '\r') key = '\n'; @@ -150,27 +143,24 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) *buffer = '\0'; return -1; } + if (escape_delay == 0 && key == '\e') { escape_delay = ESCAPE_DELAY; ped = escape_data; f_escape = f; } if (escape_delay) { - *ped++ = key; - if (f_escape != f) { - escape_delay = 2; - continue; - } + if (f_escape != f) + return '\e'; + *ped++ = key; key = kdb_handle_escape(escape_data, ped - escape_data); - if (key > 0) { - escape_data[0] = key; - escape_data[1] = '\0'; - } - if (key) - escape_delay = 2; - continue; + if (key < 0) + return '\e'; + if (key == 0) + continue; } + break; /* A key to process */ } return key; -- cgit v1.2.3 From 4f27e824bf83dfc2f6dc1a54fae419be7cd335af Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:26 +0100 Subject: kdb: Remove special case logic from kdb_read() kdb_read() contains special case logic to force it to exit after reading a single character. We can remove all the special case logic by directly calling the function to read a single character instead. This also allows us to tidy up the function prototype which, because it now matches getchar(), we can also rename in order to make its role clearer. This does involve some extra code to handle btaprompt properly but we don't mind the new lines of code here because the old code had some interesting problems (bad newline handling, odd treatment of unexpected characters).
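The net effect on callers, condensed from the diff below (a before/after sketch):

	/* before: a bufsize of 2 was the idiom for "read one key" */
	char buf1[16] = "";
	kdb_read(buf1, 2);

	/* after: ask for exactly one character */
	char ch = kdb_getchar();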
Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-4-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_bt.c | 22 +++++++++------- kernel/debug/kdb/kdb_io.c | 61 +++++++++++++++++++----------------------- kernel/debug/kdb/kdb_private.h | 1 + 3 files changed, 42 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 0e94efe07b72..4af48ac53625 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -75,9 +75,10 @@ static void kdb_show_stack(struct task_struct *p, void *addr) static int kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) { - char buffer[2]; - if (kdb_getarea(buffer[0], (unsigned long)p) || - kdb_getarea(buffer[0], (unsigned long)(p+1)-1)) + char ch; + + if (kdb_getarea(ch, (unsigned long)p) || + kdb_getarea(ch, (unsigned long)(p+1)-1)) return KDB_BADADDR; if (!kdb_task_state(p, mask)) return 0; @@ -85,12 +86,17 @@ kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt) kdb_ps1(p); kdb_show_stack(p, NULL); if (btaprompt) { - kdb_getstr(buffer, sizeof(buffer), - "Enter <q> to end, <cr> to continue:"); - if (buffer[0] == 'q') { - kdb_printf("\n"); + kdb_printf("Enter <q> to end, <cr> or <space> to continue:"); + do { + ch = kdb_getchar(); + } while (!strchr("\r\n q", ch)); + kdb_printf("\n"); + + /* reset the pager */ + kdb_nextline = 1; + + if (ch == 'q') return 1; - } } touch_nmi_watchdog(); return 0; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index a92ceca29637..9b6933d585b5 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -108,7 +108,22 @@ static int kdb_handle_escape(char *buf, size_t sz) return -1; } -static int kdb_read_get_key(char *buffer, size_t bufsize) +/** + * kdb_getchar() - Read a single character from a kdb console (or consoles). + * + * Other than polling the various consoles that are currently enabled, + * most of the work done in this function is dealing with escape sequences. + * + * An escape key could be the start of a vt100 control sequence such as \e[D + * (left arrow) or it could be a character in its own right. The standard + * method for detecting the difference is to wait for 2 seconds to see if there + * are any other characters. kdb is complicated by the lack of a timer service + * (interrupts are off), by multiple input sources. Escape sequence processing + * has to be done as states in the polling loop. + * + * Return: The key pressed or a control code derived from an escape sequence. + */ +char kdb_getchar(void) { #define ESCAPE_UDELAY 1000 #define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */ @@ -126,7 +141,6 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) } key = (*f)(); - if (key == -1) { if (escape_delay) { udelay(ESCAPE_UDELAY); @@ -136,14 +150,6 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) continue; } - if (bufsize <= 2) { - if (key == '\r') - key = '\n'; - *buffer++ = key; - *buffer = '\0'; - return -1; - } - if (escape_delay == 0 && key == '\e') { escape_delay = ESCAPE_DELAY; ped = escape_data; @@ -184,17 +190,7 @@ static int kdb_read_get_key(char *buffer, size_t bufsize) * function. It is not reentrant - it relies on the fact * that while kdb is running on only one "master debug" cpu. * Remarks: - * - * The buffer size must be >= 2. A buffer size of 2 means that the caller only - * wants a single key.
- * - * An escape key could be the start of a vt100 control sequence such as \e[D - * (left arrow) or it could be a character in its own right. The standard - * method for detecting the difference is to wait for 2 seconds to see if there - * are any other characters. kdb is complicated by the lack of a timer service - * (interrupts are off), by multiple input sources and by the need to sometimes - * return after just one key. Escape sequence processing has to be done as - * states in the polling loop. + * The buffer size must be >= 2. */ static char *kdb_read(char *buffer, size_t bufsize) @@ -229,9 +225,7 @@ static char *kdb_read(char *buffer, size_t bufsize) *cp = '\0'; kdb_printf("%s", buffer); poll_again: - key = kdb_read_get_key(buffer, bufsize); - if (key == -1) - return buffer; + key = kdb_getchar(); if (key != 9) tab = 0; switch (key) { @@ -742,7 +736,7 @@ kdb_printit: /* check for having reached the LINES number of printed lines */ if (kdb_nextline >= linecount) { - char buf1[16] = ""; + char ch; /* Watch out for recursion here. Any routine that calls * kdb_printf will come back through here. And kdb_read @@ -777,39 +771,38 @@ kdb_printit: if (logging) printk("%s", moreprompt); - kdb_read(buf1, 2); /* '2' indicates to return - * immediately after getting one key. */ + ch = kdb_getchar(); kdb_nextline = 1; /* Really set output line 1 */ /* empty and reset the buffer: */ kdb_buffer[0] = '\0'; next_avail = kdb_buffer; size_avail = sizeof(kdb_buffer); - if ((buf1[0] == 'q') || (buf1[0] == 'Q')) { + if ((ch == 'q') || (ch == 'Q')) { /* user hit q or Q */ KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */ KDB_STATE_CLEAR(PAGER); /* end of command output; back to normal mode */ kdb_grepping_flag = 0; kdb_printf("\n"); - } else if (buf1[0] == ' ') { + } else if (ch == ' ') { kdb_printf("\r"); suspend_grep = 1; /* for this recursion */ - } else if (buf1[0] == '\n') { + } else if (ch == '\n' || ch == '\r') { kdb_nextline = linecount - 1; kdb_printf("\r"); suspend_grep = 1; /* for this recursion */ - } else if (buf1[0] == '/' && !kdb_grepping_flag) { + } else if (ch == '/' && !kdb_grepping_flag) { kdb_printf("\r"); kdb_getstr(kdb_grep_string, KDB_GREP_STRLEN, kdbgetenv("SEARCHPROMPT") ?: "search> "); *strchrnul(kdb_grep_string, '\n') = '\0'; kdb_grepping_flag += KDB_GREPPING_FLAG_SEARCH; suspend_grep = 1; /* for this recursion */ - } else if (buf1[0] && buf1[0] != '\n') { - /* user hit something other than enter */ + } else if (ch) { + /* user hit something unexpected */ suspend_grep = 1; /* for this recursion */ - if (buf1[0] != '/') + if (ch != '/') kdb_printf( "\nOnly 'q', 'Q' or '/' are processed at " "more prompt, input ignored\n"); diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 2118d8258b7c..55d052061ef9 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -210,6 +210,7 @@ extern void kdb_ps1(const struct task_struct *p); extern void kdb_print_nameval(const char *name, unsigned long val); extern void kdb_send_sig(struct task_struct *p, int sig); extern void kdb_meminfo_proc_show(void); +extern char kdb_getchar(void); extern char *kdb_getstr(char *, size_t, const char *); extern void kdb_gdb_state_pass(char *buf); -- cgit v1.2.3 From cdca8d8900dd33ce6b8b526e247d2a6009d05de0 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:27 +0100 Subject: kdb: Improve handling of characters from different input sources Currently if an escape timer is interrupted by a character from a different input source 
then the new character is discarded and the function returns '\e' (which will be discarded by the level above). It is hard to see why this would ever be the desired behaviour. Fix this to return the new character rather than the '\e'. This is a bigger refactor than might be expected because the new character needs to go through escape sequence detection. Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-5-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 9b6933d585b5..f794c0ca4557 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -127,10 +127,10 @@ char kdb_getchar(void) { #define ESCAPE_UDELAY 1000 #define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */ - char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */ - char *ped = escape_data; + char buf[4]; /* longest vt100 escape sequence is 4 bytes */ + char *pbuf = buf; int escape_delay = 0; - get_char_func *f, *f_escape = NULL; + get_char_func *f, *f_prev = NULL; int key; for (f = &kdb_poll_funcs[0]; ; ++f) { @@ -150,26 +150,26 @@ char kdb_getchar(void) continue; } - if (escape_delay == 0 && key == '\e') { + /* + * When the first character is received (or we get a change + * input source) we set ourselves up to handle an escape + * sequences (just in case). + */ + if (f_prev != f) { + f_prev = f; + pbuf = buf; escape_delay = ESCAPE_DELAY; - ped = escape_data; - f_escape = f; - } - if (escape_delay) { - if (f_escape != f) - return '\e'; - - *ped++ = key; - key = kdb_handle_escape(escape_data, ped - escape_data); - if (key < 0) - return '\e'; - if (key == 0) - continue; } - break; /* A key to process */ + *pbuf++ = key; + key = kdb_handle_escape(buf, pbuf - buf); + if (key < 0) /* no escape sequence; return first character */ + return buf[0]; + if (key > 0) + return key; } - return key; + + unreachable(); } /* -- cgit v1.2.3 From c58ff643763c78bef12874ee39995c9f7f987bc2 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Fri, 25 Oct 2019 08:33:28 +0100 Subject: kdb: Tweak escape handling for vi users Currently if sequences such as "\ehelp\r" are delivered to the console then the h gets eaten by the escape handling code. Since pressing escape becomes something of a nervous twitch for vi users (and that escape doesn't have much effect at a shell prompt) it is more helpful to emit the 'h' than the '\e'. We don't simply choose to emit the final character for all escape sequences since that will do odd things for unsupported escape sequences (in other words we retain the existing behaviour once we see '\e['). Signed-off-by: Daniel Thompson Reviewed-by: Douglas Anderson Link: https://lore.kernel.org/r/20191025073328.643-6-daniel.thompson@linaro.org --- kernel/debug/kdb/kdb_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index f794c0ca4557..8bcdded5d61f 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -163,8 +163,8 @@ char kdb_getchar(void) *pbuf++ = key; key = kdb_handle_escape(buf, pbuf - buf); - if (key < 0) /* no escape sequence; return first character */ - return buf[0]; + if (key < 0) /* no escape sequence; return best character */ + return buf[pbuf - buf == 2 ? 
1 : 0]; if (key > 0) return key; } -- cgit v1.2.3 From a445e940ea686fc60475564009821010eb213be3 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Wed, 30 Oct 2019 10:13:13 +0000 Subject: dma-mapping: fix handling of dma-ranges for reserved memory (again) Daniele reported that the issue previously fixed in c41f9ea998f3 ("drivers: dma-coherent: Account dma_pfn_offset when used with device tree") reappeared shortly after 43fc509c3efb ("dma-coherent: introduce interface for default DMA pool") where the fix was accidentally dropped. Let's put the fix back in place and respect dma-ranges for reserved memory. Fixes: 43fc509c3efb ("dma-coherent: introduce interface for default DMA pool") Reported-by: Daniele Alessandrelli Tested-by: Daniele Alessandrelli Tested-by: Alexandre Torgue Signed-off-by: Vladimir Murzin Signed-off-by: Christoph Hellwig --- kernel/dma/coherent.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 545e3869b0e3..551b0eb7028a 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -123,8 +123,9 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, return ret; } -static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, - ssize_t size, dma_addr_t *dma_handle) +static void *__dma_alloc_from_coherent(struct device *dev, + struct dma_coherent_mem *mem, + ssize_t size, dma_addr_t *dma_handle) { int order = get_order(size); unsigned long flags; @@ -143,7 +144,7 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem, /* * Memory was found in the coherent area. */ - *dma_handle = mem->device_base + (pageno << PAGE_SHIFT); + *dma_handle = dma_get_device_base(dev, mem) + (pageno << PAGE_SHIFT); ret = mem->virt_base + (pageno << PAGE_SHIFT); spin_unlock_irqrestore(&mem->spinlock, flags); memset(ret, 0, size); @@ -175,17 +176,18 @@ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, if (!mem) return 0; - *ret = __dma_alloc_from_coherent(mem, size, dma_handle); + *ret = __dma_alloc_from_coherent(dev, mem, size, dma_handle); return 1; } -void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) +void *dma_alloc_from_global_coherent(struct device *dev, ssize_t size, + dma_addr_t *dma_handle) { if (!dma_coherent_default_memory) return NULL; - return __dma_alloc_from_coherent(dma_coherent_default_memory, size, - dma_handle); + return __dma_alloc_from_coherent(dev, dma_coherent_default_memory, size, + dma_handle); } static int __dma_release_from_coherent(struct dma_coherent_mem *mem, -- cgit v1.2.3 From 9ff6aa027dbb98755f0265695354f2dd07c0d1ce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 28 Oct 2019 14:56:46 -0700 Subject: dma-debug: add a schedule point in debug_dma_dump_mappings() debug_dma_dump_mappings() can take a lot of CPU cycles: lpk43:/# time wc -l /sys/kernel/debug/dma-api/dump 163435 /sys/kernel/debug/dma-api/dump real 0m0.463s user 0m0.003s sys 0m0.459s Let's add a cond_resched() to avoid holding the CPU for too long.
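The shape of the fixed loop, to show why the schedule point is safe here (a sketch; the real function walks the dma_entry_hash buckets shown in the diff below):

	for (idx = 0; idx < HASH_SIZE; idx++) {
		spin_lock_irqsave(&bucket->lock, flags);
		list_for_each_entry(entry, &bucket->list, list) {
			/* print one mapping ... */
		}
		spin_unlock_irqrestore(&bucket->lock, flags);
		cond_resched();	/* the lock is dropped, so scheduling is allowed */
	}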
Signed-off-by: Eric Dumazet Cc: Corentin Labbe Cc: Christoph Hellwig Cc: Marek Szyprowski Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 099002d84f46..4ad74f5987ea 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -420,6 +420,7 @@ void debug_dma_dump_mappings(struct device *dev) } spin_unlock_irqrestore(&bucket->lock, flags); + cond_resched(); } } -- cgit v1.2.3 From ca66536845cd55c6a5fccd82694dcc87ed970780 Mon Sep 17 00:00:00 2001 From: Shyam Saini Date: Sun, 20 Oct 2019 10:33:22 +0530 Subject: kernel: dma-contiguous: mark CMA parameters __initdata/__initconst These parameters are only referenced by __init routine calls during early boot so they should be marked as __initdata and __initconst accordingly. Signed-off-by: Shyam Saini Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 69cfb4345388..daa4e6eefdde 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -42,10 +42,11 @@ struct cma *dma_contiguous_default_area; * Users, who want to set the size of global CMA area for their system * should use cma= kernel parameter. */ -static const phys_addr_t size_bytes = (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; -static phys_addr_t size_cmdline = -1; -static phys_addr_t base_cmdline; -static phys_addr_t limit_cmdline; +static const phys_addr_t size_bytes __initconst = + (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; +static phys_addr_t size_cmdline __initdata = -1; +static phys_addr_t base_cmdline __initdata; +static phys_addr_t limit_cmdline __initdata; static int __init early_cma(char *p) { -- cgit v1.2.3 From f94df9890e98f2090c6a8d70c795134863b70201 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 24 Sep 2019 16:07:45 +0100 Subject: Add wake_up_interruptible_sync_poll_locked() Add a wakeup call for a case whereby the caller already has the waitqueue spinlock held. This can be used by pipes to alter the ring buffer indices and issue a wakeup under the same spinlock. Signed-off-by: David Howells Acked-by: Peter Zijlstra (Intel) --- kernel/sched/wait.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index b4b52361dab7..ba059fbfc53a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -191,6 +191,29 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, } EXPORT_SYMBOL_GPL(__wake_up_sync_key); +/** + * __wake_up_locked_sync_key - wake up a thread blocked on a locked waitqueue. + * @wq_head: the waitqueue + * @mode: which threads + * @key: opaque value to be passed to wakeup targets + * + * The sync wakeup differs in that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + * + * If this function wakes up a task, it executes a full memory barrier before + * accessing the task state. 
+ */ +void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, + unsigned int mode, void *key) +{ + __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key); + /* * __wake_up_sync - see __wake_up_sync_key() */ -- cgit v1.2.3 From f973cce0e4022fa8c969ca5b0c71559125382eb2 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 21 Oct 2019 22:38:29 +0200 Subject: kexec: Fix pointer-to-int-cast warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix two pointer-to-int-cast warnings when compiling for the 32-bit parisc platform: kernel/kexec_file.c: In function ‘crash_prepare_elf64_headers’: kernel/kexec_file.c:1307:19: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] phdr->p_vaddr = (Elf64_Addr)_text; ^ kernel/kexec_file.c:1324:19: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] phdr->p_vaddr = (unsigned long long) __va(mstart); ^ Signed-off-by: Helge Deller --- kernel/kexec_file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 79f252af7dee..a2df93948665 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -1304,7 +1304,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, if (kernel_map) { phdr->p_type = PT_LOAD; phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_vaddr = (Elf64_Addr)_text; + phdr->p_vaddr = (unsigned long) _text; phdr->p_filesz = phdr->p_memsz = _end - _text; phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); ehdr->e_phnum++; @@ -1321,7 +1321,7 @@ int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, phdr->p_offset = mstart; phdr->p_paddr = mstart; - phdr->p_vaddr = (unsigned long long) __va(mstart); + phdr->p_vaddr = (unsigned long) __va(mstart); phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; phdr->p_align = 0; ehdr->e_phnum++; -- cgit v1.2.3 From 7162431dcf72032835d369c8d7b51311df407938 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Wed, 16 Oct 2019 13:33:13 +0200 Subject: ftrace: Introduce PERMANENT ftrace_ops flag Livepatch uses ftrace for redirection to new patched functions. It means that if ftrace is disabled, all live patched functions are disabled as well. Toggling global 'ftrace_enabled' sysctl thus affects it directly. It is not a problem per se, because only an administrator can set sysctl values, but it still may be surprising. Introduce PERMANENT ftrace_ops flag to amend this. If FTRACE_OPS_FL_PERMANENT is set on any ftrace ops, the tracing cannot be disabled by disabling ftrace_enabled. Equally, a callback with the flag set cannot be registered if ftrace_enabled is disabled.
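A minimal sketch of a (hypothetical) callback opting in to the new flag; the callback signature matches ftrace_func_t as of this kernel:

	static void my_callback(unsigned long ip, unsigned long parent_ip,
				struct ftrace_ops *op, struct pt_regs *regs)
	{
		/* tracing or redirection work here */
	}

	static struct ftrace_ops my_ops = {
		.func	= my_callback,
		.flags	= FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_PERMANENT,
	};

	static int __init my_init(void)
	{
		/* Fails with -EBUSY if ftrace_enabled is already off. */
		return register_ftrace_function(&my_ops);
	}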
Link: http://lkml.kernel.org/r/20191016113316.13415-2-mbenes@suse.cz Reviewed-by: Petr Mladek Reviewed-by: Kamalesh Babulal Signed-off-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- kernel/livepatch/patch.c | 3 ++- kernel/trace/ftrace.c | 23 +++++++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index bd43537702bd..b552cf2d85f8 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -196,7 +196,8 @@ static int klp_patch_func(struct klp_func *func) ops->fops.func = klp_ftrace_handler; ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_DYNAMIC | - FTRACE_OPS_FL_IPMODIFY; + FTRACE_OPS_FL_IPMODIFY | + FTRACE_OPS_FL_PERMANENT; list_add(&ops->node, &klp_ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f296d89be757..89e9128652ef 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -326,6 +326,8 @@ int __register_ftrace_function(struct ftrace_ops *ops) if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) ops->flags |= FTRACE_OPS_FL_SAVE_REGS; #endif + if (!ftrace_enabled && (ops->flags & FTRACE_OPS_FL_PERMANENT)) + return -EBUSY; if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; @@ -6754,6 +6756,18 @@ int unregister_ftrace_function(struct ftrace_ops *ops) } EXPORT_SYMBOL_GPL(unregister_ftrace_function); +static bool is_permanent_ops_registered(void) +{ + struct ftrace_ops *op; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->flags & FTRACE_OPS_FL_PERMANENT) + return true; + } while_for_each_ftrace_op(op); + + return false; +} + int ftrace_enable_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -6771,8 +6785,6 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; - last_ftrace_enabled = !!ftrace_enabled; - if (ftrace_enabled) { /* we are starting ftrace again */ @@ -6783,12 +6795,19 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ftrace_startup_sysctl(); } else { + if (is_permanent_ops_registered()) { + ftrace_enabled = true; + ret = -EBUSY; + goto out; + } + /* stopping ftrace calls (just send to ftrace_stub) */ ftrace_trace_function = ftrace_stub; ftrace_shutdown_sysctl(); } + last_ftrace_enabled = !!ftrace_enabled; out: mutex_unlock(&ftrace_lock); return ret; -- cgit v1.2.3 From 56144737e67329c9aaed15f942d46a6302e2e3d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Nov 2019 09:48:04 -0800 Subject: hrtimer: Annotate lockless access to timer->state syzbot reported various data races caused by hrtimer_is_queued() reading timer->state. A READ_ONCE() is required there to silence the warning. Also add the corresponding WRITE_ONCE() when timer->state is set. In remove_hrtimer() the hrtimer_is_queued() helper is open coded to avoid loading timer->state twice.
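The reader side of the annotation lives in include/linux/hrtimer.h and is outside this 'kernel'-limited view; it presumably pairs with the WRITE_ONCE() updates in the diff below along these lines (a sketch):

	static inline bool hrtimer_is_queued(struct hrtimer *timer)
	{
		/* Pairs with the WRITE_ONCE()s in enqueue_hrtimer()
		 * and __remove_hrtimer(). */
		return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED);
	}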
KCSAN reported these cases: BUG: KCSAN: data-race in __remove_hrtimer / tcp_pacing_check write to 0xffff8880b2a7d388 of 1 bytes by interrupt on cpu 0: __remove_hrtimer+0x52/0x130 kernel/time/hrtimer.c:991 __run_hrtimer kernel/time/hrtimer.c:1496 [inline] __hrtimer_run_queues+0x250/0x600 kernel/time/hrtimer.c:1576 hrtimer_run_softirq+0x10e/0x150 kernel/time/hrtimer.c:1593 __do_softirq+0x115/0x33f kernel/softirq.c:292 run_ksoftirqd+0x46/0x60 kernel/softirq.c:603 smpboot_thread_fn+0x37d/0x4a0 kernel/smpboot.c:165 kthread+0x1d4/0x200 drivers/block/aoe/aoecmd.c:1253 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:352 read to 0xffff8880b2a7d388 of 1 bytes by task 24652 on cpu 1: tcp_pacing_check net/ipv4/tcp_output.c:2235 [inline] tcp_pacing_check+0xba/0x130 net/ipv4/tcp_output.c:2225 tcp_xmit_retransmit_queue+0x32c/0x5a0 net/ipv4/tcp_output.c:3044 tcp_xmit_recovery+0x7c/0x120 net/ipv4/tcp_input.c:3558 tcp_ack+0x17b6/0x3170 net/ipv4/tcp_input.c:3717 tcp_rcv_established+0x37e/0xf50 net/ipv4/tcp_input.c:5696 tcp_v4_do_rcv+0x381/0x4e0 net/ipv4/tcp_ipv4.c:1561 sk_backlog_rcv include/net/sock.h:945 [inline] __release_sock+0x135/0x1e0 net/core/sock.c:2435 release_sock+0x61/0x160 net/core/sock.c:2951 sk_stream_wait_memory+0x3d7/0x7c0 net/core/stream.c:145 tcp_sendmsg_locked+0xb47/0x1f30 net/ipv4/tcp.c:1393 tcp_sendmsg+0x39/0x60 net/ipv4/tcp.c:1434 inet_sendmsg+0x6d/0x90 net/ipv4/af_inet.c:807 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 BUG: KCSAN: data-race in __remove_hrtimer / __tcp_ack_snd_check write to 0xffff8880a3a65588 of 1 bytes by interrupt on cpu 0: __remove_hrtimer+0x52/0x130 kernel/time/hrtimer.c:991 __run_hrtimer kernel/time/hrtimer.c:1496 [inline] __hrtimer_run_queues+0x250/0x600 kernel/time/hrtimer.c:1576 hrtimer_run_softirq+0x10e/0x150 kernel/time/hrtimer.c:1593 __do_softirq+0x115/0x33f kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0xbb/0xe0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:536 [inline] smp_apic_timer_interrupt+0xe6/0x280 arch/x86/kernel/apic/apic.c:1137 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830 read to 0xffff8880a3a65588 of 1 bytes by task 22891 on cpu 1: __tcp_ack_snd_check+0x415/0x4f0 net/ipv4/tcp_input.c:5265 tcp_ack_snd_check net/ipv4/tcp_input.c:5287 [inline] tcp_rcv_established+0x750/0xf50 net/ipv4/tcp_input.c:5708 tcp_v4_do_rcv+0x381/0x4e0 net/ipv4/tcp_ipv4.c:1561 sk_backlog_rcv include/net/sock.h:945 [inline] __release_sock+0x135/0x1e0 net/core/sock.c:2435 release_sock+0x61/0x160 net/core/sock.c:2951 sk_stream_wait_memory+0x3d7/0x7c0 net/core/stream.c:145 tcp_sendmsg_locked+0xb47/0x1f30 net/ipv4/tcp.c:1393 tcp_sendmsg+0x39/0x60 net/ipv4/tcp.c:1434 inet_sendmsg+0x6d/0x90 net/ipv4/af_inet.c:807 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0x9f/0xc0 net/socket.c:657 __sys_sendto+0x21f/0x320 net/socket.c:1952 __do_sys_sendto net/socket.c:1964 [inline] __se_sys_sendto net/socket.c:1960 [inline] __x64_sys_sendto+0x89/0xb0 net/socket.c:1960 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 24652 Comm: syz-executor.3 Not tainted 5.4.0-rc3+ #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 [ tglx: Added comments ] Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191106174804.74723-1-edumazet@google.com --- kernel/time/hrtimer.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) 
(limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 65605530ee34..7f31932216a1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -966,7 +966,8 @@ static int enqueue_hrtimer(struct hrtimer *timer, base->cpu_base->active_bases |= 1 << base->index; - timer->state = HRTIMER_STATE_ENQUEUED; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } @@ -988,7 +989,8 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; - timer->state = newstate; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; @@ -1013,8 +1015,9 @@ static void __remove_hrtimer(struct hrtimer *timer, static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { - if (hrtimer_is_queued(timer)) { - u8 state = timer->state; + u8 state = timer->state; + + if (state & HRTIMER_STATE_ENQUEUED) { int reprogram; /* -- cgit v1.2.3 From acaade1af3587132e7ea585f470a95261e14f60c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 09:57:09 +0100 Subject: dma-direct: remove __dma_direct_free_pages We can just call dma_free_contiguous directly instead of wrapping it. Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov --- kernel/dma/direct.c | 11 +++-------- kernel/dma/remap.c | 4 ++-- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 8402b29c280f..a7a2739fb747 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -153,7 +153,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, * so log an error and fail. 
*/ dev_info(dev, "Rejecting highmem page from CMA.\n"); - __dma_direct_free_pages(dev, size, page); + dma_free_contiguous(dev, page, size); return NULL; } @@ -175,11 +175,6 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, return ret; } -void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page) -{ - dma_free_contiguous(dev, page, size); -} - void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { @@ -188,7 +183,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && !force_dma_unencrypted(dev)) { /* cpu_addr is a struct page cookie, not a kernel address */ - __dma_direct_free_pages(dev, size, cpu_addr); + dma_free_contiguous(dev, cpu_addr, size); return; } @@ -198,7 +193,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && dma_alloc_need_uncached(dev, attrs)) cpu_addr = cached_kernel_address(cpu_addr); - __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr)); + dma_free_contiguous(dev, virt_to_page(cpu_addr), size); } void *dma_direct_alloc(struct device *dev, size_t size, diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index c00b9258fa6a..fb1e50c2d48a 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -238,7 +238,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, dma_pgprot(dev, PAGE_KERNEL, attrs), __builtin_return_address(0)); if (!ret) { - __dma_direct_free_pages(dev, size, page); + dma_free_contiguous(dev, page, size); return ret; } @@ -256,7 +256,7 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, struct page *page = pfn_to_page(__phys_to_pfn(phys)); vunmap(vaddr); - __dma_direct_free_pages(dev, size, page); + dma_free_contiguous(dev, page, size); } } -- cgit v1.2.3 From 4e1003aa56a7d60ddb048e43a7a51368fcfe36af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 09:57:32 +0100 Subject: dma-direct: remove the dma_handle argument to __dma_direct_alloc_pages The argument isn't used anywhere, so stop passing it. 
Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov --- kernel/dma/direct.c | 4 ++-- kernel/dma/remap.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a7a2739fb747..724c282dd943 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -83,7 +83,7 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) } struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) + gfp_t gfp, unsigned long attrs) { size_t alloc_size = PAGE_ALIGN(size); int node = dev_to_node(dev); @@ -131,7 +131,7 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - page = __dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); + page = __dma_direct_alloc_pages(dev, size, gfp, attrs); if (!page) return NULL; diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index fb1e50c2d48a..90d5ce77c189 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -226,7 +226,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, goto done; } - page = __dma_direct_alloc_pages(dev, size, dma_handle, flags, attrs); + page = __dma_direct_alloc_pages(dev, size, flags, attrs); if (!page) return NULL; -- cgit v1.2.3 From 714641c3670cdc75371a7ff5bdfd5e9a170c7ffd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 12:25:46 -0500 Subject: ftrace: Separate out the copying of a ftrace_hash from __ftrace_hash_move() Most of the functionality of __ftrace_hash_move() can be reused, but not all of it. That is, __ftrace_hash_move() is used to simply make a new hash from an existing one, using the same size as the original. Creating a dup_hash(), where we can specify a new size will be useful when we want to create a hash with a default size, or simply copy the old one. Signed-off-by: Steven Rostedt (VMWare) --- kernel/trace/ftrace.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 89e9128652ef..76e5de8c7822 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1372,23 +1372,15 @@ ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash); static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, struct ftrace_hash *new_hash); -static struct ftrace_hash * -__ftrace_hash_move(struct ftrace_hash *src) +static struct ftrace_hash *dup_hash(struct ftrace_hash *src, int size) { struct ftrace_func_entry *entry; - struct hlist_node *tn; - struct hlist_head *hhd; struct ftrace_hash *new_hash; - int size = src->count; + struct hlist_head *hhd; + struct hlist_node *tn; int bits = 0; int i; - /* - * If the new source is empty, just return the empty_hash. - */ - if (ftrace_hash_empty(src)) - return EMPTY_HASH; - /* * Make the hash size about 1/2 the # found */ @@ -1413,10 +1405,23 @@ __ftrace_hash_move(struct ftrace_hash *src) __add_hash_entry(new_hash, entry); } } - return new_hash; } +static struct ftrace_hash * +__ftrace_hash_move(struct ftrace_hash *src) +{ + int size = src->count; + + /* + * If the new source is empty, just return the empty_hash. 
+ */ + if (ftrace_hash_empty(src)) + return EMPTY_HASH; + + return dup_hash(src, size); +} + static int ftrace_hash_move(struct ftrace_ops *ops, int enable, struct ftrace_hash **dst, struct ftrace_hash *src) -- cgit v1.2.3 From 7e16f581a81759bafea04d049134b32d1a881226 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 12:26:46 -0500 Subject: ftrace: Separate out functionality from ftrace_location_range() Create a new function called lookup_rec() from the functionality of ftrace_location_range(). The difference is that lookup_rec() returns the record that it finds, whereas ftrace_location_range() returns only whether it found a match or not. lookup_rec() is static, and can be used for new functionality where ftrace needs to find the record of a specific address. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 76e5de8c7822..b0e7f03919de 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1541,6 +1541,26 @@ static int ftrace_cmp_recs(const void *a, const void *b) return 0; } +static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec = NULL; + struct dyn_ftrace key; + + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + } + return rec; +} + /** * ftrace_location_range - return the first address of a traced location * if it touches the given ip range @@ -1555,23 +1575,11 @@ static int ftrace_cmp_recs(const void *a, const void *b) */ unsigned long ftrace_location_range(unsigned long start, unsigned long end) { - struct ftrace_page *pg; struct dyn_ftrace *rec; - struct dyn_ftrace key; - key.ip = start; - key.flags = end; /* overload flags, as it is unsigned long */ - - for (pg = ftrace_pages_start; pg; pg = pg->next) { - if (end < pg->records[0].ip || - start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) - continue; - rec = bsearch(&key, pg->records, pg->index, - sizeof(struct dyn_ftrace), - ftrace_cmp_recs); - if (rec) - return rec->ip; - } + rec = lookup_rec(start, end); + if (rec) + return rec->ip; return 0; } -- cgit v1.2.3 From 153bedbac2ebd475e1c7c2d2fa0c042f5525927d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Nov 2019 17:08:55 +0100 Subject: irq_work: Convert flags to atomic_t We need to convert flags to atomic_t in order to later fix an ordering issue on atomic_cmpxchg() failure. This will allow us to use atomic_fetch_or(). Also clarify the nature of those flags. [ mingo: Converted two more usage sites the original patch missed. ] Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108160858.31665-2-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/bpf/stackmap.c | 2 +- kernel/irq_work.c | 18 +++++++++--------- kernel/printk/printk.c | 2 +- kernel/trace/bpf_trace.c | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 052580c33d26..4d31284095e2 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -289,7 +289,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, if (in_nmi()) { work = this_cpu_ptr(&up_read_work); - if (work->irq_work.flags & IRQ_WORK_BUSY) + if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) /* cannot queue more up_read, fallback */ irq_work_busy = true; } diff --git a/kernel/irq_work.c b/kernel/irq_work.c index d42acaf81886..df0dbf4d859b 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -29,16 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list); */ static bool irq_work_claim(struct irq_work *work) { - unsigned long flags, oflags, nflags; + int flags, oflags, nflags; /* * Start with our best wish as a premise but only trust any * flag value after cmpxchg() result. */ - flags = work->flags & ~IRQ_WORK_PENDING; + flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; for (;;) { nflags = flags | IRQ_WORK_CLAIMED; - oflags = cmpxchg(&work->flags, flags, nflags); + oflags = atomic_cmpxchg(&work->flags, flags, nflags); if (oflags == flags) break; if (oflags & IRQ_WORK_PENDING) @@ -61,7 +61,7 @@ void __weak arch_irq_work_raise(void) static void __irq_work_queue_local(struct irq_work *work) { /* If the work is "lazy", handle it from next tick if any */ - if (work->flags & IRQ_WORK_LAZY) { + if (atomic_read(&work->flags) & IRQ_WORK_LAZY) { if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && tick_nohz_tick_stopped()) arch_irq_work_raise(); @@ -143,7 +143,7 @@ static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; - unsigned long flags; + int flags; BUG_ON(!irqs_disabled()); @@ -159,15 +159,15 @@ static void irq_work_run_list(struct llist_head *list) * to claim that work don't rely on us to handle their data * while we are in the middle of the func. */ - flags = work->flags & ~IRQ_WORK_PENDING; - xchg(&work->flags, flags); + flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; + atomic_xchg(&work->flags, flags); work->func(work); /* * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. 
*/ - (void)cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); + (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); } } @@ -199,7 +199,7 @@ void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); - while (work->flags & IRQ_WORK_BUSY) + while (atomic_read(&work->flags) & IRQ_WORK_BUSY) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ca65327a6de8..865727373a3b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2961,7 +2961,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { .func = wake_up_klogd_work_func, - .flags = IRQ_WORK_LAZY, + .flags = ATOMIC_INIT(IRQ_WORK_LAZY), }; void wake_up_klogd(void) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 44bd08f2443b..ff467a4e2639 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -660,7 +660,7 @@ BPF_CALL_1(bpf_send_signal, u32, sig) return -EINVAL; work = this_cpu_ptr(&send_signal_work); - if (work->irq_work.flags & IRQ_WORK_BUSY) + if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) return -EBUSY; /* Add the current task, which is the target of sending signal, -- cgit v1.2.3 From 25269871db1ad0cbbaafd5098cbdb40c8db4ccb9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Nov 2019 17:08:56 +0100 Subject: irq_work: Fix irq_work_claim() memory ordering When irq_work_claim() finds the IRQ_WORK_PENDING flag already set, we just return and don't raise a new IPI. We expect the destination to see and handle our latest updates thanks to the pairing atomic_xchg() in irq_work_run_list(). But cmpxchg() doesn't guarantee a full memory barrier upon failure. So it's possible that the destination misses our latest updates. So use atomic_fetch_or() instead, which is unconditionally fully ordered, performs exactly what we want here, and simplifies the code. Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108160858.31665-3-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index df0dbf4d859b..255454a48346 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -29,24 +29,16 @@ static DEFINE_PER_CPU(struct llist_head, lazy_list); */ static bool irq_work_claim(struct irq_work *work) { - int flags, oflags, nflags; + int oflags; + oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); /* - * Start with our best wish as a premise but only trust any - * flag value after cmpxchg() result. + * If the work is already pending, no need to raise the IPI. + * The pairing atomic_xchg() in irq_work_run() makes sure + * everything we did before is visible.
*/ - flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; - for (;;) { - nflags = flags | IRQ_WORK_CLAIMED; - oflags = atomic_cmpxchg(&work->flags, flags, nflags); - if (oflags == flags) - break; - if (oflags & IRQ_WORK_PENDING) - return false; - flags = oflags; - cpu_relax(); - } - + if (oflags & IRQ_WORK_PENDING) + return false; return true; } -- cgit v1.2.3 From feb4a51323babe13315c3b783ea7f1cf25368918 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Nov 2019 17:08:57 +0100 Subject: irq_work: Slightly simplify IRQ_WORK_PENDING clearing Instead of fetching the value of flags and performing an xchg() to clear a bit, just use atomic_fetch_andnot(), which is more suitable for doing that job in one operation while keeping the full ordering. Signed-off-by: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108160858.31665-4-frederic@kernel.org Signed-off-by: Ingo Molnar --- kernel/irq_work.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 255454a48346..49c53f80a13a 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -34,7 +34,7 @@ static bool irq_work_claim(struct irq_work *work) oflags = atomic_fetch_or(IRQ_WORK_CLAIMED, &work->flags); /* * If the work is already pending, no need to raise the IPI. - * The pairing atomic_xchg() in irq_work_run() makes sure + * The pairing atomic_fetch_andnot() in irq_work_run() makes sure * everything we did before is visible. */ if (oflags & IRQ_WORK_PENDING) @@ -135,7 +135,6 @@ static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; - int flags; BUG_ON(!irqs_disabled()); @@ -144,6 +143,7 @@ static void irq_work_run_list(struct llist_head *list) llnode = llist_del_all(list); llist_for_each_entry_safe(work, tmp, llnode, llnode) { + int flags; /* * Clear the PENDING bit, after this point the @work * can be re-used. * to claim that work don't rely on us to handle their data * while we are in the middle of the func. */ - flags = atomic_read(&work->flags) & ~IRQ_WORK_PENDING; - atomic_xchg(&work->flags, flags); + flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); work->func(work); /* -- cgit v1.2.3 From 34dc0ea6bc960f1f57b2148f01a3f4da23f87013 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 11:01:37 +0100 Subject: dma-direct: provide mmap and get_sgtable method overrides For dma-direct we know that the DMA address is an encoding of the physical address that we can trivially decode. Use that fact to provide implementations that do not need the arch_dma_coherent_to_pfn architecture hook. Note that we can still support mmap of non-coherent memory only if the architecture provides a way to set an uncached bit in the page tables. This must be true for architectures that use the generic remap helpers, but other architectures can also manually select it.
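As a hedged sketch of the call path this serves: a driver exporting a coherent buffer to userspace goes through dma_mmap_coherent(), which on dma-direct devices now ends up in dma_direct_mmap(); everything except the dma_* calls below is invented for illustration.

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct mydrv_dev *md = file->private_data;	/* hypothetical driver state */

	/* md->vaddr/md->dma_addr came from dma_alloc_coherent(md->dev, ...) */
	return dma_mmap_coherent(md->dev, vma, md->vaddr, md->dma_addr, md->size);
}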
Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov --- kernel/dma/Kconfig | 12 ++++++++--- kernel/dma/direct.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/dma/mapping.c | 45 +++++++-------------------------------- kernel/dma/remap.c | 6 ------ 4 files changed, 75 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 73c5c2b8e824..4c103a24e380 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -51,9 +51,6 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL config ARCH_HAS_DMA_PREP_COHERENT bool -config ARCH_HAS_DMA_COHERENT_TO_PFN - bool - config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool @@ -68,9 +65,18 @@ config SWIOTLB bool select NEED_DMA_MAP_STATE +# +# Should be selected if we can mmap non-coherent mappings to userspace. +# The only thing that is really required is a way to set an uncached bit +# in the pagetables +# +config DMA_NONCOHERENT_MMAP + bool + config DMA_REMAP depends on MMU select GENERIC_ALLOCATOR + select DMA_NONCOHERENT_MMAP bool config DMA_DIRECT_REMAP diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 724c282dd943..58beaa9ddd27 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -43,6 +43,12 @@ static inline dma_addr_t phys_to_dma_direct(struct device *dev, return phys_to_dma(dev, phys); } +static inline struct page *dma_direct_to_page(struct device *dev, + dma_addr_t dma_addr) +{ + return pfn_to_page(PHYS_PFN(dma_to_phys(dev, dma_addr))); +} + u64 dma_direct_get_required_mask(struct device *dev) { u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); @@ -379,6 +385,59 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, } EXPORT_SYMBOL(dma_direct_map_resource); +int dma_direct_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + struct page *page = dma_direct_to_page(dev, dma_addr); + int ret; + + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); + if (!ret) + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); + return ret; +} + +#ifdef CONFIG_MMU +bool dma_direct_can_mmap(struct device *dev) +{ + return dev_is_dma_coherent(dev) || + IS_ENABLED(CONFIG_DMA_NONCOHERENT_MMAP); +} + +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + unsigned long user_count = vma_pages(vma); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + unsigned long pfn = PHYS_PFN(dma_to_phys(dev, dma_addr)); + int ret = -ENXIO; + + vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); + + if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) + return ret; + + if (vma->vm_pgoff >= count || user_count > count - vma->vm_pgoff) + return -ENXIO; + return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, + user_count << PAGE_SHIFT, vma->vm_page_prot); +} +#else /* CONFIG_MMU */ +bool dma_direct_can_mmap(struct device *dev) +{ + return false; +} + +int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + return -ENXIO; +} +#endif /* CONFIG_MMU */ + /* * Because 32-bit DMA masks are so common we expect every architecture to be * able to satisfy them - either by not supporting more physical memory, or by diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index d9334f31a5af..12ff766ec1fa 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -112,24 +112,9 @@ int 
dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { - struct page *page; + struct page *page = virt_to_page(cpu_addr); int ret; - if (!dev_is_dma_coherent(dev)) { - unsigned long pfn; - - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) - return -ENXIO; - - /* If the PFN is not valid, we do not have a struct page */ - pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); - if (!pfn_valid(pfn)) - return -ENXIO; - page = pfn_to_page(pfn); - } else { - page = virt_to_page(cpu_addr); - } - ret = sg_alloc_table(sgt, 1, GFP_KERNEL); if (!ret) sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); @@ -154,7 +139,7 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_is_direct(ops)) - return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, + return dma_direct_get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); if (!ops->get_sgtable) return -ENXIO; @@ -192,7 +177,6 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long user_count = vma_pages(vma); unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; - unsigned long pfn; int ret = -ENXIO; vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); @@ -203,19 +187,8 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, if (off >= count || user_count > count - off) return -ENXIO; - if (!dev_is_dma_coherent(dev)) { - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)) - return -ENXIO; - - /* If the PFN is not valid, we do not have a struct page */ - pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr); - if (!pfn_valid(pfn)) - return -ENXIO; - } else { - pfn = page_to_pfn(virt_to_page(cpu_addr)); - } - - return remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, + return remap_pfn_range(vma, vma->vm_start, + page_to_pfn(virt_to_page(cpu_addr)) + vma->vm_pgoff, user_count << PAGE_SHIFT, vma->vm_page_prot); #else return -ENXIO; @@ -233,12 +206,8 @@ bool dma_can_mmap(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); - if (dma_is_direct(ops)) { - return IS_ENABLED(CONFIG_MMU) && - (dev_is_dma_coherent(dev) || - IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN)); - } - + if (dma_is_direct(ops)) + return dma_direct_can_mmap(dev); return ops->mmap != NULL; } EXPORT_SYMBOL_GPL(dma_can_mmap); @@ -263,7 +232,7 @@ int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, const struct dma_map_ops *ops = get_dma_ops(dev); if (dma_is_direct(ops)) - return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size, + return dma_direct_mmap(dev, vma, cpu_addr, dma_addr, size, attrs); if (!ops->mmap) return -ENXIO; diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 90d5ce77c189..3c49499ee6b0 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -259,10 +259,4 @@ void arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_free_contiguous(dev, page, size); } } - -long arch_dma_coherent_to_pfn(struct device *dev, void *cpu_addr, - dma_addr_t dma_addr) -{ - return __phys_to_pfn(dma_to_phys(dev, dma_addr)); -} #endif /* CONFIG_DMA_DIRECT_REMAP */ -- cgit v1.2.3 From 3acac065508f6cc60ac9d3e4b7c6cc37fd91d531 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Oct 2019 11:06:32 +0100 Subject: dma-mapping: merge the generic remapping helpers into dma-direct Integrate the generic dma remapping implementation into the main flow. 
This prepares for architectures like xtensa that use an uncached segment for pages in the kernel mapping, but can also remap highmem from CMA. To simplify that implementation, we now always deduce the page from the physical address via the DMA address instead of the virtual address. Signed-off-by: Christoph Hellwig Reviewed-by: Max Filippov --- kernel/dma/direct.c | 60 ++++++++++++++++++++++++++++++++++++++++++----------- kernel/dma/remap.c | 49 ------------------------------------------- 2 files changed, 48 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 58beaa9ddd27..22a2e0833862 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -137,6 +138,15 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_alloc_need_uncached(dev, attrs) && + !gfpflags_allow_blocking(gfp)) { + ret = dma_alloc_from_pool(PAGE_ALIGN(size), &page, gfp); + if (!ret) + return NULL; + goto done; + } + page = __dma_direct_alloc_pages(dev, size, gfp, attrs); if (!page) return NULL; @@ -146,9 +156,28 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, /* remove any dirty cache lines on the kernel alias */ if (!PageHighMem(page)) arch_dma_prep_coherent(page, size); - *dma_handle = phys_to_dma(dev, page_to_phys(page)); /* return the page pointer as the opaque cookie */ - return page; + ret = page; + goto done; + } + + if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_alloc_need_uncached(dev, attrs)) || + (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { + /* remove any dirty cache lines on the kernel alias */ + arch_dma_prep_coherent(page, PAGE_ALIGN(size)); + + /* create a coherent mapping */ + ret = dma_common_contiguous_remap(page, PAGE_ALIGN(size), + dma_pgprot(dev, PAGE_KERNEL, attrs), + __builtin_return_address(0)); + if (!ret) { + dma_free_contiguous(dev, page, size); + return ret; + } + + memset(ret, 0, size); + goto done; } if (PageHighMem(page)) { @@ -164,12 +193,9 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, } ret = page_address(page); - if (force_dma_unencrypted(dev)) { + if (force_dma_unencrypted(dev)) set_memory_decrypted((unsigned long)ret, 1 << get_order(size)); - *dma_handle = __phys_to_dma(dev, page_to_phys(page)); - } else { - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - } + memset(ret, 0, size); if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && @@ -177,7 +203,11 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, arch_dma_prep_coherent(page, size); ret = uncached_kernel_address(ret); } - +done: + if (force_dma_unencrypted(dev)) + *dma_handle = __phys_to_dma(dev, page_to_phys(page)); + else + *dma_handle = phys_to_dma(dev, page_to_phys(page)); return ret; } @@ -193,19 +223,24 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, return; } + if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + dma_free_from_pool(cpu_addr, PAGE_ALIGN(size))) + return; + if (force_dma_unencrypted(dev)) set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); - if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && - dma_alloc_need_uncached(dev, attrs)) - cpu_addr = cached_kernel_address(cpu_addr); - dma_free_contiguous(dev, virt_to_page(cpu_addr), size); + if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) + vunmap(cpu_addr); + + dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);
} void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); @@ -215,6 +250,7 @@ void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && dma_alloc_need_uncached(dev, attrs)) arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); else diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 3c49499ee6b0..d47bd40fc0f5 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -210,53 +210,4 @@ bool dma_free_from_pool(void *start, size_t size) gen_pool_free(atomic_pool, (unsigned long)start, size); return true; } - -void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t flags, unsigned long attrs) -{ - struct page *page = NULL; - void *ret; - - size = PAGE_ALIGN(size); - - if (!gfpflags_allow_blocking(flags)) { - ret = dma_alloc_from_pool(size, &page, flags); - if (!ret) - return NULL; - goto done; - } - - page = __dma_direct_alloc_pages(dev, size, flags, attrs); - if (!page) - return NULL; - - /* remove any dirty cache lines on the kernel alias */ - arch_dma_prep_coherent(page, size); - - /* create a coherent mapping */ - ret = dma_common_contiguous_remap(page, size, - dma_pgprot(dev, PAGE_KERNEL, attrs), - __builtin_return_address(0)); - if (!ret) { - dma_free_contiguous(dev, page, size); - return ret; - } - - memset(ret, 0, size); -done: - *dma_handle = phys_to_dma(dev, page_to_phys(page)); - return ret; -} - -void arch_dma_free(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_handle, unsigned long attrs) -{ - if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { - phys_addr_t phys = dma_to_phys(dev, dma_handle); - struct page *page = pfn_to_page(__phys_to_pfn(phys)); - - vunmap(vaddr); - dma_free_contiguous(dev, page, size); - } -} #endif /* CONFIG_DMA_DIRECT_REMAP */ -- cgit v1.2.3 From d3694f30732fd2a334b93f087033c5a5836f7aba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Oct 2019 12:32:04 -0700 Subject: dma-debug: reorder struct dma_debug_entry fields Move all fields used during exact match lookups to the first cache line. This makes debug_dma_mapping_error() and friends about 50% faster. Since it removes two 32bit holes, force a cacheline alignment on struct dma_debug_entry. 
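For context (a sketch from memory, not part of this diff): the exact-match lookup that dominates debug_dma_mapping_error() compares just two fields, so packing both into the entry's first cache line is what buys the speedup.

static bool exact_match(struct dma_debug_entry *a, struct dma_debug_entry *b)
{
	/* Both fields now sit in the first cache line of the entry. */
	return ((a->dev_addr == b->dev_addr) && (a->dev == b->dev));
}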
Signed-off-by: Eric Dumazet Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 4ad74f5987ea..a5b85dabfb8c 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -54,33 +54,33 @@ enum map_err_types { * struct dma_debug_entry - track a dma_map* or dma_alloc_coherent mapping * @list: node on pre-allocated free_entries list * @dev: 'dev' argument to dma_map_{page|single|sg} or dma_alloc_coherent - * @type: single, page, sg, coherent - * @pfn: page frame of the start address - * @offset: offset of mapping relative to pfn * @size: length of the mapping + * @type: single, page, sg, coherent * @direction: enum dma_data_direction * @sg_call_ents: 'nents' from dma_map_sg * @sg_mapped_ents: 'mapped_ents' from dma_map_sg + * @pfn: page frame of the start address + * @offset: offset of mapping relative to pfn * @map_err_type: track whether dma_mapping_error() was checked * @stacktrace: support backtraces when a violation is detected */ struct dma_debug_entry { struct list_head list; struct device *dev; - int type; - unsigned long pfn; - size_t offset; u64 dev_addr; u64 size; + int type; int direction; int sg_call_ents; int sg_mapped_ents; + unsigned long pfn; + size_t offset; enum map_err_types map_err_type; #ifdef CONFIG_STACKTRACE unsigned int stack_len; unsigned long stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; #endif -}; +} ____cacheline_aligned_in_smp; typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *); -- cgit v1.2.3 From 5e76f564572b85735de4b75a5e73b514be2562be Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Oct 2019 11:48:44 -0700 Subject: dma-debug: increase HASH_SIZE With modern NICs, it is not unusual to have about 256,000 active DMA mappings, and a hash size of 1024 buckets is too small. Forcing a full cache line per bucket does not seem useful, especially now that we have contention on free_entries_lock for allocations and freeing of entries. Better to use the space to fit more buckets. Signed-off-by: Eric Dumazet Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index a5b85dabfb8c..004496654aaa 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -27,7 +27,7 @@ #include -#define HASH_SIZE 1024ULL +#define HASH_SIZE 16384ULL #define HASH_FN_SHIFT 13 #define HASH_FN_MASK (HASH_SIZE - 1) @@ -87,7 +87,7 @@ typedef bool (*match_fn)(struct dma_debug_entry *, struct dma_debug_entry *); struct hash_bucket { struct list_head list; spinlock_t lock; -} ____cacheline_aligned_in_smp; +}; /* Hash list to save the allocated dma addresses */ static struct hash_bucket dma_entry_hash[HASH_SIZE]; -- cgit v1.2.3 From 9a066357184485784f782719093ff804d05b85db Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:52 +0900 Subject: kheaders: remove unneeded 'cat' command piped to 'head' / 'tail' The 'head' and 'tail' commands can take a file path directly. So, you do not need to run 'cat'. cat kernel/kheaders.md5 | head -1 ... is equivalent to: head -1 kernel/kheaders.md5 and the latter saves forking one process. While I was here, I replaced 'head -1' with 'head -n 1'. I also replaced '==' with '=' since we do not have a good reason to use the bashism.
Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 5a0fc0b0403a..b8054b0d5010 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -41,10 +41,10 @@ obj_files_md5="$(find $dir_list -name "*.h" | this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)" if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi if [ -f kernel/kheaders.md5 ] && - [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && - [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && - [ "$(cat kernel/kheaders.md5|head -3|tail -1)" == "$this_file_md5" ] && - [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then + [ "$(head -n 1 kernel/kheaders.md5)" = "$src_files_md5" ] && + [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$obj_files_md5" ] && + [ "$(head -n 3 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] && + [ "$(tail -n 1 kernel/kheaders.md5)" = "$tarfile_md5" ]; then exit fi -- cgit v1.2.3 From 0e11773e76098729552b750ccff79374d1e62002 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:53 +0900 Subject: kheaders: optimize md5sum calculation for in-tree builds This script computes md5sum of headers in srctree and in objtree. However, when we are building in-tree, we know the srctree and the objtree are the same. That is, we end up with the same computation twice. In fact, the first two lines of kernel/kheaders.md5 are always the same for in-tree builds. Unify the two md5sum calculations. For in-tree builds ($building_out_of_srctree is empty), we check only two directories, "include", and "arch/$SRCARCH/include". For out-of-tree builds ($building_out_of_srctree is 1), we check 4 directories, "$srctree/include", "$srctree/arch/$SRCARCH/include", "include", and "arch/$SRCARCH/include" since we know they are all different. Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index b8054b0d5010..6ff86e62787f 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -21,29 +21,30 @@ arch/$SRCARCH/include/ # Uncomment it for debugging. # if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter; # else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi -# find $src_file_list -name "*.h" | xargs ls -l > /tmp/src-ls-$iter -# find $obj_file_list -name "*.h" | xargs ls -l > /tmp/obj-ls-$iter +# find $all_dirs -name "*.h" | xargs ls -l > /tmp/ls-$iter + +all_dirs= +if [ "$building_out_of_srctree" ]; then + for d in $dir_list; do + all_dirs="$all_dirs $srctree/$d" + done +fi +all_dirs="$all_dirs $dir_list" # include/generated/compile.h is ignored because it is touched even when none # of the source files changed. This causes pointless regeneration, so let us # ignore them for md5 calculation. 
-pushd $srctree > /dev/null -src_files_md5="$(find $dir_list -name "*.h" | - grep -v "include/generated/compile.h" | - grep -v "include/generated/autoconf.h" | - xargs ls -l | md5sum | cut -d ' ' -f1)" -popd > /dev/null -obj_files_md5="$(find $dir_list -name "*.h" | - grep -v "include/generated/compile.h" | - grep -v "include/generated/autoconf.h" | +headers_md5="$(find $all_dirs -name "*.h" | + grep -v "include/generated/compile.h" | + grep -v "include/generated/autoconf.h" | xargs ls -l | md5sum | cut -d ' ' -f1)" + # Any changes to this script will also cause a rebuild of the archive. this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)" if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi if [ -f kernel/kheaders.md5 ] && - [ "$(head -n 1 kernel/kheaders.md5)" = "$src_files_md5" ] && - [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$obj_files_md5" ] && - [ "$(head -n 3 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] && + [ "$(head -n 1 kernel/kheaders.md5)" = "$headers_md5" ] && + [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] && [ "$(tail -n 1 kernel/kheaders.md5)" = "$tarfile_md5" ]; then exit fi @@ -79,8 +80,7 @@ find $cpio_dir -printf "./%P\n" | LC_ALL=C sort | \ --owner=0 --group=0 --numeric-owner --no-recursion \ -Jcf $tarfile -C $cpio_dir/ -T - > /dev/null -echo "$src_files_md5" > kernel/kheaders.md5 -echo "$obj_files_md5" >> kernel/kheaders.md5 +echo $headers_md5 > kernel/kheaders.md5 echo "$this_file_md5" >> kernel/kheaders.md5 echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 -- cgit v1.2.3 From ea79e5168be644fdaf7d4e6a73eceaf07b3da76a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:54 +0900 Subject: kheaders: optimize header copy for in-tree builds This script copies headers with the cpio command twice; first from srctree, and then from objtree. However, when we are building in-tree, we know the srctree and the objtree are the same. That is, all the headers copied by the first cpio are overwritten by the second one. Skip the first cpio when we are building in-tree. Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 6ff86e62787f..0f7752dd93a6 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -56,14 +56,16 @@ fi rm -rf $cpio_dir mkdir $cpio_dir -pushd $srctree > /dev/null -for f in $dir_list; - do find "$f" -name "*.h"; -done | cpio --quiet -pd $cpio_dir -popd > /dev/null +if [ "$building_out_of_srctree" ]; then + pushd $srctree > /dev/null + for f in $dir_list + do find "$f" -name "*.h"; + done | cpio --quiet -pd $cpio_dir + popd > /dev/null +fi -# The second CPIO can complain if files already exist which can -# happen with out of tree builds. Just silence CPIO for now. +# The second CPIO can complain if files already exist which can happen with out +# of tree builds having stale headers in srctree. Just silence CPIO for now. for f in $dir_list; do find "$f" -name "*.h"; done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 -- cgit v1.2.3 From 1463f74f492eea7191f0178e01f3d38371a48210 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:55 +0900 Subject: kheaders: remove the last bashism to allow sh to run it 'pushd' ... 'popd' is the last bash-specific code in this script. One way to avoid it is to run the code in a sub-shell.
With that addressed, you can run this script with sh. I replaced $(BASH) with $(CONFIG_SHELL), and I changed the hashbang to #!/bin/sh. Signed-off-by: Masahiro Yamada --- kernel/Makefile | 2 +- kernel/gen_kheaders.sh | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index daad787fb795..42557f251fea 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -128,7 +128,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz - cmd_genikh = $(BASH) $(srctree)/kernel/gen_kheaders.sh $@ + cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@ $(obj)/kheaders_data.tar.xz: FORCE $(call cmd,genikh) diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 0f7752dd93a6..dc5744b93f8c 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/sh # SPDX-License-Identifier: GPL-2.0 # This script generates an archive consisting of kernel headers @@ -57,11 +57,12 @@ rm -rf $cpio_dir mkdir $cpio_dir if [ "$building_out_of_srctree" ]; then - pushd $srctree > /dev/null - for f in $dir_list - do find "$f" -name "*.h"; - done | cpio --quiet -pd $cpio_dir - popd > /dev/null + ( + cd $srctree + for f in $dir_list + do find "$f" -name "*.h"; + done | cpio --quiet -pd $cpio_dir + ) fi # The second CPIO can complain if files already exist which can happen with out -- cgit v1.2.3 From f276031b4e2f4c961ed6d8a42f0f0124ccac2e09 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 8 Oct 2019 21:05:56 +0900 Subject: kheaders: explain why include/config/autoconf.h is excluded from md5sum This comment block explains why include/generated/compile.h is omitted, but nothing about include/generated/autoconf.h, which might be more difficult to understand. Add more comments. Signed-off-by: Masahiro Yamada --- kernel/gen_kheaders.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index dc5744b93f8c..e13ca842eb7e 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -32,8 +32,15 @@ fi all_dirs="$all_dirs $dir_list" # include/generated/compile.h is ignored because it is touched even when none -# of the source files changed. This causes pointless regeneration, so let us -# ignore them for md5 calculation. +# of the source files changed. +# +# When Kconfig regenerates include/generated/autoconf.h, its timestamp is +# updated, but the contents might be still the same. When any CONFIG option is +# changed, Kconfig touches the corresponding timestamp file include/config/*.h. +# Hence, the md5sum detects the configuration change anyway. We do not need to +# check include/generated/autoconf.h explicitly. +# +# Ignore them for md5 calculation to avoid pointless regeneration. headers_md5="$(find $all_dirs -name "*.h" | grep -v "include/generated/compile.h" | grep -v "include/generated/autoconf.h" | -- cgit v1.2.3 From c1d51f684c72b5eb2aecbbd47be3a2977a2dc903 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 7 Nov 2019 15:25:12 +0100 Subject: cpuidle: Use nanoseconds as the unit of time Currently, the cpuidle subsystem uses microseconds as the unit of time which (among other things) causes the idle loop to incur some integer division overhead for no clear benefit. 
In order to allow cpuidle to measure time in nanoseconds, add two new fields, exit_latency_ns and target_residency_ns, to represent the exit latency and target residency of an idle state in nanoseconds, respectively, to struct cpuidle_state and initialize them with the help of the corresponding values in microseconds provided by drivers. Additionally, change cpuidle_governor_latency_req() to return the idle state exit latency constraint in nanoseconds. Also measure idle state residency (last_residency_ns in struct cpuidle_device and time_ns in struct cpuidle_driver) in nanoseconds and update the cpuidle core and governors accordingly. However, the menu governor still computes typical intervals in microseconds to avoid integer overflows. Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Doug Smythies Tested-by: Doug Smythies --- kernel/sched/idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8dad5aa600ea..1aa260702b38 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -104,7 +104,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, * update no idle residency and return. */ if (current_clr_polling_and_test()) { - dev->last_residency = 0; + dev->last_residency_ns = 0; local_irq_enable(); return -EBUSY; } -- cgit v1.2.3 From 20d087368d38c7350a4519a3b316ef7eb2504692 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 8 Nov 2019 21:34:25 +0100 Subject: time: Optimize ns_to_timespec64() ns_to_timespec64() calls div_s64_rem(), which is a rather slow function on 32-bit architectures, as it cannot take advantage of the do_div() optimizations for constant arguments. Open-code the div_s64_rem() function in ns_to_timespec64(), so a constant divider can be passed into the optimized div_u64_rem() function. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20191108203435.112759-3-arnd@arndb.de --- kernel/time/time.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 5c54ca632d08..45a358953f09 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -550,18 +550,21 @@ EXPORT_SYMBOL(set_normalized_timespec64); */ struct timespec64 ns_to_timespec64(const s64 nsec) { - struct timespec64 ts; + struct timespec64 ts = { 0, 0 }; s32 rem; - if (!nsec) - return (struct timespec64) {0, 0}; - - ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); - if (unlikely(rem < 0)) { - ts.tv_sec--; - rem += NSEC_PER_SEC; + if (likely(nsec > 0)) { + ts.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + ts.tv_nsec = rem; + } else if (nsec < 0) { + /* + * With negative times, tv_sec points to the earlier + * second, and tv_nsec counts the nanoseconds since + * then, so tv_nsec is always a positive number.
+ */ + ts.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1; + ts.tv_nsec = NSEC_PER_SEC - rem - 1; } - ts.tv_nsec = rem; return ts; } -- cgit v1.2.3 From 1d6acc18fee71a0db6e4fbbfbdb247e0bd5b0655 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Tue, 15 Oct 2019 13:03:39 +0530 Subject: time: Fix spelling mistake in comment witin => within Signed-off-by: Mukesh Ojha Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1571124819-9639-1-git-send-email-mojha@codeaurora.org --- kernel/time/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 45a358953f09..ea6e7e47cc37 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -179,7 +179,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz return error; if (tz) { - /* Verify we're witin the +-15 hrs range */ + /* Verify we're within the +-15 hrs range */ if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) return -EINVAL; -- cgit v1.2.3 From cf25e24db61cc9df42c47485a2ec2bff4e9a3692 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Nov 2019 11:07:58 +0100 Subject: time: Rename tsk->real_start_time to ->start_boottime Since it stores CLOCK_BOOTTIME, not, as the name suggests, CLOCK_REALTIME, let's rename ->real_start_time to ->start_boottime. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..1392ee8f4848 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2130,7 +2130,7 @@ static __latent_entropy struct task_struct *copy_process( */ p->start_time = ktime_get_ns(); - p->real_start_time = ktime_get_boottime_ns(); + p->start_boottime = ktime_get_boottime_ns(); /* * Make it visible to the rest of the system, but dont wake it up yet. -- cgit v1.2.3 From 763e34e74bb7d5c316015e2e39fcc8520bfd071c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:07:06 -0500 Subject: ftrace: Add register_ftrace_direct() Add the start of the functionality to allow other trampolines to use the ftrace mcount/fentry/nop location. This adds two new functions: register_ftrace_direct() and unregister_ftrace_direct(). Both take two parameters: the first is the instruction address of where the mcount/fentry/nop exists, and the second is the trampoline to have that location called. This will handle cases where ftrace is already used on that same location, and will make it still work, where the registered direct-call trampoline will get called after all the registered ftrace callers are handled. Currently, it will not allow for IP_MODIFY functions to be called at the same locations, which include some kprobes and live kernel patching. At this point, no architecture supports this. This is only the start of implementing the framework.
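A minimal usage sketch under stated assumptions: my_tramp is a hypothetical arch-specific assembly trampoline that preserves the traced function's arguments, and wake_up_process() is just an example attach point.

#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/sched.h>

extern void my_tramp(void);	/* assumed to be provided elsewhere */

static int __init mydirect_init(void)
{
	/* Attach my_tramp to the fentry/nop site at wake_up_process(). */
	return register_ftrace_direct((unsigned long)wake_up_process,
				      (unsigned long)my_tramp);
}

static void __exit mydirect_exit(void)
{
	unregister_ftrace_direct((unsigned long)wake_up_process,
				 (unsigned long)my_tramp);
}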
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 8 ++ kernel/trace/ftrace.c | 269 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 272 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e08527f50d2a..624a05e99b0b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -33,6 +33,9 @@ config HAVE_DYNAMIC_FTRACE config HAVE_DYNAMIC_FTRACE_WITH_REGS bool +config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + bool + config HAVE_FTRACE_MCOUNT_RECORD bool help @@ -557,6 +560,11 @@ config DYNAMIC_FTRACE_WITH_REGS depends on DYNAMIC_FTRACE depends on HAVE_DYNAMIC_FTRACE_WITH_REGS +config DYNAMIC_FTRACE_WITH_DIRECT_CALLS + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + config FUNCTION_PROFILER bool "Kernel function profiler" depends on FUNCTION_TRACER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b0e7f03919de..329a3f3789a1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1023,6 +1023,7 @@ static bool update_all_ops; struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; + unsigned long direct; /* for direct lookup only */ }; struct ftrace_func_probe { @@ -1730,6 +1731,9 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) return false; + if (ops->flags & FTRACE_OPS_FL_DIRECT) + rec->flags |= FTRACE_FL_DIRECT; + /* * If there's only a single callback registered to a * function, and the ops has a trampoline registered @@ -1757,6 +1761,15 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, return false; rec->flags--; + /* + * Only the internal direct_ops should have the + * DIRECT flag set. Thus, if it is removing a + * function, then that function should no longer + * be direct. + */ + if (ops->flags & FTRACE_OPS_FL_DIRECT) + rec->flags &= ~FTRACE_FL_DIRECT; + /* * If the rec had REGS enabled and the ops that is * being removed had REGS set, then see if there is @@ -2092,15 +2105,34 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) * If enabling and the REGS flag does not match the REGS_EN, or * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore * this record. Set flags to fail the compare against ENABLED. + * Same for direct calls. */ if (flag) { - if (!(rec->flags & FTRACE_FL_REGS) != + if (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)) flag |= FTRACE_FL_REGS; - if (!(rec->flags & FTRACE_FL_TRAMP) != + if (!(rec->flags & FTRACE_FL_TRAMP) != !(rec->flags & FTRACE_FL_TRAMP_EN)) flag |= FTRACE_FL_TRAMP; + + /* + * Direct calls are special, as count matters. + * We must test the record for direct, if the + * DIRECT and DIRECT_EN do not match, but only + * if the count is 1. That's because, if the + * count is something other than one, we do not + * want the direct enabled (it will be done via the + * direct helper). But if DIRECT_EN is set, and + * the count is not one, we need to clear it. 
+ */ + if (ftrace_rec_count(rec) == 1) { + if (!(rec->flags & FTRACE_FL_DIRECT) != + !(rec->flags & FTRACE_FL_DIRECT_EN)) + flag |= FTRACE_FL_DIRECT; + } else if (rec->flags & FTRACE_FL_DIRECT_EN) { + flag |= FTRACE_FL_DIRECT; + } } /* If the state of this record hasn't changed, then do nothing */ @@ -2125,6 +2157,25 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) else rec->flags &= ~FTRACE_FL_TRAMP_EN; } + if (flag & FTRACE_FL_DIRECT) { + /* + * If there's only one user (direct_ops helper) + * then we can call the direct function + * directly (no ftrace trampoline). + */ + if (ftrace_rec_count(rec) == 1) { + if (rec->flags & FTRACE_FL_DIRECT) + rec->flags |= FTRACE_FL_DIRECT_EN; + else + rec->flags &= ~FTRACE_FL_DIRECT_EN; + } else { + /* + * Can only call directly if there's + * only one callback to the function. + */ + rec->flags &= ~FTRACE_FL_DIRECT_EN; + } + } } /* @@ -2154,7 +2205,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) * and REGS states. The _EN flags must be disabled though. */ rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN | - FTRACE_FL_REGS_EN); + FTRACE_FL_REGS_EN | FTRACE_FL_DIRECT_EN); } ftrace_bug_type = FTRACE_BUG_NOP; @@ -2309,6 +2360,51 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) return NULL; } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/* Protected by rcu_tasks for reading, and direct_mutex for writing */ +static struct ftrace_hash *direct_functions = EMPTY_HASH; +static DEFINE_MUTEX(direct_mutex); + +/* + * Search the direct_functions hash to see if the given instruction pointer + * has a direct caller attached to it. + */ +static unsigned long find_rec_direct(unsigned long ip) +{ + struct ftrace_func_entry *entry; + + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) + return 0; + + return entry->direct; +} + +static void call_direct_funcs(unsigned long ip, unsigned long pip, + struct ftrace_ops *ops, struct pt_regs *regs) +{ + unsigned long addr; + + addr = find_rec_direct(ip); + if (!addr) + return; + + arch_ftrace_set_direct_caller(regs, addr); +} + +struct ftrace_ops direct_ops = { + .func = call_direct_funcs, + .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE + | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS + | FTRACE_OPS_FL_PERMANENT, +}; +#else +static inline unsigned long find_rec_direct(unsigned long ip) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + /** * ftrace_get_addr_new - Get the call address to set to * @rec: The ftrace record descriptor @@ -2322,6 +2418,15 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) { struct ftrace_ops *ops; + unsigned long addr; + + if ((rec->flags & FTRACE_FL_DIRECT) && + (ftrace_rec_count(rec) == 1)) { + addr = find_rec_direct(rec->ip); + if (addr) + return addr; + WARN_ON_ONCE(1); + } /* Trampolines take precedence over regs */ if (rec->flags & FTRACE_FL_TRAMP) { @@ -2354,6 +2459,15 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) { struct ftrace_ops *ops; + unsigned long addr; + + /* Direct calls take precedence over trampolines */ + if (rec->flags & FTRACE_FL_DIRECT_EN) { + addr = find_rec_direct(rec->ip); + if (addr) + return addr; + WARN_ON_ONCE(1); + } /* Trampolines take precedence over regs */ if (rec->flags & FTRACE_FL_TRAMP_EN) { @@ -3465,10 +3579,11 @@ static int t_show(struct seq_file *m, void *v) if (iter->flags & 
FTRACE_ITER_ENABLED) { struct ftrace_ops *ops; - seq_printf(m, " (%ld)%s%s", + seq_printf(m, " (%ld)%s%s%s", ftrace_rec_count(rec), rec->flags & FTRACE_FL_REGS ? " R" : " ", - rec->flags & FTRACE_FL_IPMODIFY ? " I" : " "); + rec->flags & FTRACE_FL_IPMODIFY ? " I" : " ", + rec->flags & FTRACE_FL_DIRECT ? " D" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); if (ops) { @@ -3484,6 +3599,13 @@ static int t_show(struct seq_file *m, void *v) } else { add_trampoline_func(m, NULL, rec); } + if (rec->flags & FTRACE_FL_DIRECT) { + unsigned long direct; + + direct = find_rec_direct(rec->ip); + if (direct) + seq_printf(m, "\n\tdirect-->%pS", (void *)direct); + } } seq_putc(m, '\n'); @@ -4815,6 +4937,143 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable); } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +/** + * register_ftrace_direct - Call a custom trampoline directly + * @ip: The address of the nop at the beginning of a function + * @addr: The address of the trampoline to call at @ip + * + * This is used to connect a direct call from the nop location (@ip) + * at the start of ftrace traced functions. The location that it calls + * (@addr) must be able to handle a direct call, and save the parameters + * of the function being traced, and restore them (or inject new ones + * if needed), before returning. + * + * Returns: + * 0 on success + * -EBUSY - Another direct function is already attached (there can be only one) + * -ENODEV - @ip does not point to a ftrace nop location (or not supported) + * -ENOMEM - There was an allocation failure. + */ +int register_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *free_hash = NULL; + struct dyn_ftrace *rec; + int ret = -EBUSY; + + mutex_lock(&direct_mutex); + + /* See if there's a direct function at @ip already */ + if (find_rec_direct(ip)) + goto out_unlock; + + ret = -ENODEV; + rec = lookup_rec(ip, ip); + if (!rec) + goto out_unlock; + + /* + * Check if the rec says it has a direct call but we didn't + * find one earlier? + */ + if (WARN_ON(rec->flags & FTRACE_FL_DIRECT)) + goto out_unlock; + + /* Make sure the ip points to the exact record */ + ip = rec->ip; + + ret = -ENOMEM; + if (ftrace_hash_empty(direct_functions) || + direct_functions->count > 2 * (1 << direct_functions->size_bits)) { + struct ftrace_hash *new_hash; + int size = ftrace_hash_empty(direct_functions) ? 
0 : + direct_functions->count + 1; + + if (size < 32) + size = 32; + + new_hash = dup_hash(direct_functions, size); + if (!new_hash) + goto out_unlock; + + free_hash = direct_functions; + direct_functions = new_hash; + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out_unlock; + + entry->ip = ip; + entry->direct = addr; + __add_hash_entry(direct_functions, entry); + + ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); + if (ret) + remove_hash_entry(direct_functions, entry); + + if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) { + ret = register_ftrace_function(&direct_ops); + if (ret) + ftrace_set_filter_ip(&direct_ops, ip, 1, 0); + } + + if (ret) + kfree(entry); + out_unlock: + mutex_unlock(&direct_mutex); + + if (free_hash) { + synchronize_rcu_tasks(); + free_ftrace_hash(free_hash); + } + + return ret; +} +EXPORT_SYMBOL_GPL(register_ftrace_direct); + +int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) { + /* OK if it is off by a little */ + rec = lookup_rec(ip, ip); + if (!rec || rec->ip == ip) + goto out_unlock; + + entry = __ftrace_lookup_ip(direct_functions, rec->ip); + if (!entry) { + WARN_ON(rec->flags & FTRACE_FL_DIRECT); + goto out_unlock; + } + + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); + } + + if (direct_functions->count == 1) + unregister_ftrace_function(&direct_ops); + + ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0); + + WARN_ON(ret); + + remove_hash_entry(direct_functions, entry); + + out_unlock: + mutex_unlock(&direct_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_direct); +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + /** * ftrace_set_filter_ip - set a function to filter on in ftrace by address * @ops - the ops to set the filter with -- cgit v1.2.3 From 013bf0da0474816f57739daa006c8564ad7396a3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:11:27 -0500 Subject: ftrace: Add ftrace_find_direct_func() As the function_graph tracer modifies the return address to insert a trampoline to trace the return of a function, it must be aware of a direct caller, as when it gets called, the function's return address may not be on the stack where it expects. It may have to check whether that return address points to a direct caller and adjust if it does. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 329a3f3789a1..c4446eabacbe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4938,6 +4938,46 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, } #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + +struct ftrace_direct_func { + struct list_head next; + unsigned long addr; + int count; +}; + +static LIST_HEAD(ftrace_direct_funcs); + +/** + * ftrace_find_direct_func - test an address if it is a registered direct caller + * @addr: The address of a registered direct caller + * + * This searches to see if a ftrace direct caller has been registered + * at a specific address, and if so, it returns a descriptor for it. + * + * This can be used by architecture code to see if an address is + * a direct caller (trampoline) attached to a fentry/mcount location.
+ * This is useful for the function_graph tracer, as it may need to + * do adjustments if it traced a location that also has a direct + * trampoline attached to it. + */ +struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr) +{ + struct ftrace_direct_func *entry; + bool found = false; + + /* May be called by fgraph trampoline (protected by rcu tasks) */ + list_for_each_entry_rcu(entry, &ftrace_direct_funcs, next) { + if (entry->addr == addr) { + found = true; + break; + } + } + if (found) + return entry; + + return NULL; +} + /** * register_ftrace_direct - Call a custom trampoline directly * @ip: The address of the nop at the beginning of a function @@ -4957,6 +4997,7 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, */ int register_ftrace_direct(unsigned long ip, unsigned long addr) { + struct ftrace_direct_func *direct; struct ftrace_func_entry *entry; struct ftrace_hash *free_hash = NULL; struct dyn_ftrace *rec; @@ -5005,6 +5046,18 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (!entry) goto out_unlock; + direct = ftrace_find_direct_func(addr); + if (!direct) { + direct = kmalloc(sizeof(*direct), GFP_KERNEL); + if (!direct) { + kfree(entry); + goto out_unlock; + } + direct->addr = addr; + direct->count = 0; + list_add_rcu(&direct->next, &ftrace_direct_funcs); + } + entry->ip = ip; entry->direct = addr; __add_hash_entry(direct_functions, entry); @@ -5019,8 +5072,20 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) ftrace_set_filter_ip(&direct_ops, ip, 1, 0); } - if (ret) + if (ret) { kfree(entry); + if (!direct->count) { + list_del_rcu(&direct->next); + synchronize_rcu_tasks(); + kfree(direct); + if (free_hash) + free_ftrace_hash(free_hash); + free_hash = NULL; + } + } else { + if (!direct->count) + direct->count++; + } out_unlock: mutex_unlock(&direct_mutex); @@ -5036,6 +5101,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct); int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { struct ftrace_func_entry *entry; + struct ftrace_direct_func *direct; struct dyn_ftrace *rec; int ret = -ENODEV; @@ -5066,6 +5132,17 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) remove_hash_entry(direct_functions, entry); + direct = ftrace_find_direct_func(addr); + if (!WARN_ON(!direct)) { + /* This is the good path (see the ! before WARN) */ + direct->count--; + WARN_ON(direct->count < 0); + if (!direct->count) { + list_del_rcu(&direct->next); + synchronize_rcu_tasks(); + kfree(direct); + } + } out_unlock: mutex_unlock(&direct_mutex); -- cgit v1.2.3 From a3ad1a7e39689005cb04a4f2adb82f9d55b4724f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Nov 2019 13:12:57 -0500 Subject: ftrace/x86: Add a counter to test function_graph with direct As testing for direct calls from the function graph tracer adds a little overhead (which is a lot when tracing every function), add a counter that can be used to test if function_graph tracer needs to test for a direct caller or not. It would have been nicer if we could use a static branch, but the static branch logic fails when used within the function graph tracer trampoline. 
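As a hedged sketch of the intended fast path (the real consumer is the arch return-trampoline code, which lies outside this kernel/-only diff, and the helper name below is hypothetical), the counter turns the common no-direct-callers case into a single load and branch:

	static bool return_addr_is_direct(unsigned long addr)
	{
		/* Fast path: no direct callers are registered anywhere */
		if (!ftrace_direct_func_count)
			return false;

		/* Slow path: walk the ftrace_direct_funcs list */
		return ftrace_find_direct_func(addr) != NULL;
	}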
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c4446eabacbe..f9456346ec66 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2364,6 +2364,7 @@ ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) /* Protected by rcu_tasks for reading, and direct_mutex for writing */ static struct ftrace_hash *direct_functions = EMPTY_HASH; static DEFINE_MUTEX(direct_mutex); +int ftrace_direct_func_count; /* * Search the direct_functions hash to see if the given instruction pointer @@ -5056,6 +5057,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) direct->addr = addr; direct->count = 0; list_add_rcu(&direct->next, &ftrace_direct_funcs); + ftrace_direct_func_count++; } entry->ip = ip; @@ -5081,6 +5083,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) if (free_hash) free_ftrace_hash(free_hash); free_hash = NULL; + ftrace_direct_func_count--; } } else { if (!direct->count) @@ -5141,6 +5144,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) list_del_rcu(&direct->next); synchronize_rcu_tasks(); kfree(direct); + ftrace_direct_func_count--; } } out_unlock: -- cgit v1.2.3 From da537f0aef1372c5204356a7df06be8769467b7b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 1 Oct 2019 14:38:07 -0400 Subject: ftrace: Add information on number of page groups allocated When looking for ways to shrink the size of the dyn_ftrace structure, knowing how many pages are allocated and how many groups those pages fall into is useful in working out the best ways to save memory. This adds an info print of how many groups of pages were used to allocate the ftrace dyn_ftrace structures, and also shows the number of pages and groups in dyn_ftrace_total_info (which is used for debugging).
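The bookkeeping itself is simple; a hedged sketch of the invariant this maintains (the helper below is illustrative, the real updates are open-coded in the diff):

	unsigned long ftrace_number_of_pages;
	unsigned long ftrace_number_of_groups;

	static void account_page_group(unsigned int order)
	{
		/* One alloc_pages() call makes one group of 2^order pages */
		ftrace_number_of_pages += 1UL << order;
		ftrace_number_of_groups++;
	}

With, say, groups of order 5, 3 and 0, the new boot message would read "ftrace: allocated 41 pages with 3 groups" (made-up numbers).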
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 14 ++++++++++++++ kernel/trace/trace.c | 21 +++++++++++++++------ kernel/trace/trace.h | 2 ++ 3 files changed, 31 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f9456346ec66..d2d488c43a6a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2991,6 +2991,8 @@ static void ftrace_shutdown_sysctl(void) static u64 ftrace_update_time; unsigned long ftrace_update_tot_cnt; +unsigned long ftrace_number_of_pages; +unsigned long ftrace_number_of_groups; static inline int ops_traces_mod(struct ftrace_ops *ops) { @@ -3115,6 +3117,9 @@ static int ftrace_allocate_records(struct ftrace_page *pg, int count) goto again; } + ftrace_number_of_pages += 1 << order; + ftrace_number_of_groups++; + cnt = (PAGE_SIZE << order) / ENTRY_SIZE; pg->size = cnt; @@ -3170,6 +3175,8 @@ ftrace_allocate_pages(unsigned long num_to_init) start_pg = pg->next; kfree(pg); pg = start_pg; + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; } pr_info("ftrace: FAILED to allocate memory for functions\n"); return NULL; @@ -6173,6 +6180,8 @@ void ftrace_release_mod(struct module *mod) free_pages((unsigned long)pg->records, order); tmp_page = pg->next; kfree(pg); + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; } } @@ -6514,6 +6523,8 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr) *last_pg = pg->next; order = get_count_order(pg->size / ENTRIES_PER_PAGE); free_pages((unsigned long)pg->records, order); + ftrace_number_of_pages -= 1 << order; + ftrace_number_of_groups--; kfree(pg); pg = container_of(last_pg, struct ftrace_page, next); if (!(*last_pg)) @@ -6569,6 +6580,9 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); + pr_info("ftrace: allocated %ld pages with %ld groups\n", + ftrace_number_of_pages, ftrace_number_of_groups); + set_ftrace_early_filters(); return; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6a0ee9178365..5ea8c7c0f2d7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -7583,14 +7583,23 @@ static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - unsigned long *p = filp->private_data; - char buf[64]; /* Not too big for a shallow stack */ + ssize_t ret; + char *buf; int r; - r = scnprintf(buf, 63, "%ld", *p); - buf[r++] = '\n'; + /* 256 should be plenty to hold the amount needed */ + buf = kmalloc(256, GFP_KERNEL); + if (!buf) + return -ENOMEM; - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + r = scnprintf(buf, 256, "%ld pages:%ld groups: %ld\n", + ftrace_update_tot_cnt, + ftrace_number_of_pages, + ftrace_number_of_groups); + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + kfree(buf); + return ret; } static const struct file_operations tracing_dyn_info_fops = { @@ -8782,7 +8791,7 @@ static __init int tracer_init_tracefs(void) #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, - &ftrace_update_tot_cnt, &tracing_dyn_info_fops); + NULL, &tracing_dyn_info_fops); #endif create_trace_instances(d_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d685c61085c0..8b590f10bc72 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -804,6 +804,8 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +extern unsigned long ftrace_number_of_pages; 
+extern unsigned long ftrace_number_of_groups; void ftrace_init_trace_array(struct trace_array *tr); #else static inline void ftrace_init_trace_array(struct trace_array *tr) { } -- cgit v1.2.3 From 91edde2e6ae1dd5e33812f076f3fe4cb7ccbfdd0 Mon Sep 17 00:00:00 2001 From: "Viktor Rosendahl (BMW)" Date: Wed, 9 Oct 2019 00:08:21 +0200 Subject: ftrace: Implement fs notification for tracing_max_latency This patch implements fsnotify notifications for the tracing_max_latency file, e.g. /sys/kernel/debug/tracing/tracing_max_latency: user space is notified through the fsnotify framework whenever a new max latency is available. One particularly interesting use of this facility is when enabling threshold tracing, through /sys/kernel/debug/tracing/tracing_thresh, together with the preempt/irqsoff tracers. This makes it possible to implement a user space program that can, with equal probability, obtain traces of latencies that occur immediately after each other in spite of the fact that the preempt/irqsoff tracers operate in overwrite mode. This facility works with the hwlat, preempt/irqsoff, and wakeup tracers. The tracers may call latency_fsnotify() from places such as __schedule() or do_idle(); this makes it impossible to call queue_work() directly without risking a deadlock. The same would happen with a softirq, kernel thread or tasklet. For this reason we use the irq_work mechanism to call queue_work(). This patch creates a new workqueue. The reason for doing this is that I wanted to use the WQ_UNBOUND and WQ_HIGHPRI flags. My thinking was that WQ_UNBOUND might help with the latency in some important cases. If we use: queue_work(system_highpri_wq, &tr->fsnotify_work); then the work will (almost) always execute on the same CPU, but if we are unlucky that CPU could be too busy while another CPU in the system would be able to process the work soon enough. queue_work_on() could be used to queue the work on another CPU but it seems difficult to select the right CPU.
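On the user-space side a program can now block on the file instead of polling it; a minimal hedged sketch, assuming debugfs is mounted in the usual place and omitting error handling:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/inotify.h>

	int main(void)
	{
		char buf[4096];
		int fd = inotify_init();

		inotify_add_watch(fd,
			"/sys/kernel/debug/tracing/tracing_max_latency",
			IN_MODIFY);

		/* Blocks until a new max latency is recorded */
		read(fd, buf, sizeof(buf));
		printf("new max latency available\n");
		return 0;
	}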
Link: http://lkml.kernel.org/r/20191008220824.7911-2-viktor.rosendahl@gmail.com Reviewed-by: Joel Fernandes (Google) Signed-off-by: Viktor Rosendahl (BMW) [ Added max() to have one compare for max latency ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 75 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/trace/trace.h | 18 +++++++++++ kernel/trace/trace_hwlat.c | 11 ++++--- 3 files changed, 98 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5ea8c7c0f2d7..f093a433cb42 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -45,6 +45,9 @@ #include #include #include +#include +#include +#include #include "trace.h" #include "trace_output.h" @@ -1497,6 +1500,74 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) } unsigned long __read_mostly tracing_thresh; +static const struct file_operations tracing_max_lat_fops; + +#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + defined(CONFIG_FSNOTIFY) + +static struct workqueue_struct *fsnotify_wq; + +static void latency_fsnotify_workfn(struct work_struct *work) +{ + struct trace_array *tr = container_of(work, struct trace_array, + fsnotify_work); + fsnotify(tr->d_max_latency->d_inode, FS_MODIFY, + tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0); +} + +static void latency_fsnotify_workfn_irq(struct irq_work *iwork) +{ + struct trace_array *tr = container_of(iwork, struct trace_array, + fsnotify_irqwork); + queue_work(fsnotify_wq, &tr->fsnotify_work); +} + +static void trace_create_maxlat_file(struct trace_array *tr, + struct dentry *d_tracer) +{ + INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); + init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); + tr->d_max_latency = trace_create_file("tracing_max_latency", 0644, + d_tracer, &tr->max_latency, + &tracing_max_lat_fops); +} + +__init static int latency_fsnotify_init(void) +{ + fsnotify_wq = alloc_workqueue("tr_max_lat_wq", + WQ_UNBOUND | WQ_HIGHPRI, 0); + if (!fsnotify_wq) { + pr_err("Unable to allocate tr_max_lat_wq\n"); + return -ENOMEM; + } + return 0; +} + +late_initcall_sync(latency_fsnotify_init); + +void latency_fsnotify(struct trace_array *tr) +{ + if (!fsnotify_wq) + return; + /* + * We cannot call queue_work(&tr->fsnotify_work) from here because it's + * possible that we are called from __schedule() or do_idle(), which + * could cause a deadlock. 
+ */ + irq_work_queue(&tr->fsnotify_irqwork); +} + +/* + * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + * defined(CONFIG_FSNOTIFY) + */ +#else + +#define trace_create_maxlat_file(tr, d_tracer) \ + trace_create_file("tracing_max_latency", 0644, d_tracer, \ + &tr->max_latency, &tracing_max_lat_fops) + +#endif #ifdef CONFIG_TRACER_MAX_TRACE /* @@ -1536,6 +1607,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) /* record this tasks comm */ tracing_record_cmdline(tsk); + latency_fsnotify(tr); } /** @@ -8594,8 +8666,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) - trace_create_file("tracing_max_latency", 0644, d_tracer, - &tr->max_latency, &tracing_max_lat_fops); + trace_create_maxlat_file(tr, d_tracer); #endif if (ftrace_create_function_files(tr, d_tracer)) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8b590f10bc72..718eb998c13e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -16,6 +16,8 @@ #include #include #include +#include +#include #ifdef CONFIG_FTRACE_SYSCALLS #include /* For NR_SYSCALLS */ @@ -264,6 +266,11 @@ struct trace_array { #endif #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) unsigned long max_latency; +#ifdef CONFIG_FSNOTIFY + struct dentry *d_max_latency; + struct work_struct fsnotify_work; + struct irq_work fsnotify_irqwork; +#endif #endif struct trace_pid_list __rcu *filtered_pids; /* @@ -786,6 +793,17 @@ void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); #endif /* CONFIG_TRACER_MAX_TRACE */ +#if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ + defined(CONFIG_FSNOTIFY) + +void latency_fsnotify(struct trace_array *tr); + +#else + +static void latency_fsnotify(struct trace_array *tr) { } + +#endif + #ifdef CONFIG_STACKTRACE void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 862f4b0139fc..63526670605a 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -237,6 +237,7 @@ static int get_sample(void) /* If we exceed the threshold value, we have found a hardware latency */ if (sample > thresh || outer_sample > thresh) { struct hwlat_sample s; + u64 latency; ret = 1; @@ -253,11 +254,13 @@ static int get_sample(void) s.nmi_count = nmi_count; trace_hwlat_sample(&s); + latency = max(sample, outer_sample); + /* Keep a running maximum ever recorded hardware latency */ - if (sample > tr->max_latency) - tr->max_latency = sample; - if (outer_sample > tr->max_latency) - tr->max_latency = outer_sample; + if (latency > tr->max_latency) { + tr->max_latency = latency; + latency_fsnotify(tr); + } } out: -- cgit v1.2.3 From 793937236d1ee032d2ee5ccc27bdd280a04e766e Mon Sep 17 00:00:00 2001 From: "Viktor Rosendahl (BMW)" Date: Wed, 9 Oct 2019 00:08:22 +0200 Subject: preemptirq_delay_test: Add the burst feature and a sysfs trigger This burst feature enables the user to generate a burst of preempt/irqsoff latencies. This makes it possible to test whether we are able to detect latencies that systematically occur very close to each other. The maximum burst size is 10. We also create 10 identical test functions, so that we get 10 different backtraces; this is useful when we want to test whether we can detect all the latencies in a burst. 
Otherwise, there would be no easy way of differentiating between which latency in a burst was captured by the tracer. In addition, there is a sysfs trigger, so that it's not necessary to reload the module to repeat the test. The trigger will appear as /sys/kernel/preemptirq_delay_test/trigger in sysfs. Link: http://lkml.kernel.org/r/20191008220824.7911-3-viktor.rosendahl@gmail.com Reviewed-by: Joel Fernandes (Google) Signed-off-by: Viktor Rosendahl (BMW) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 6 +- kernel/trace/preemptirq_delay_test.c | 144 ++++++++++++++++++++++++++++++----- 2 files changed, 128 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 624a05e99b0b..d25314bc7a1c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -760,9 +760,9 @@ config PREEMPTIRQ_DELAY_TEST configurable delay. The module busy waits for the duration of the critical section. - For example, the following invocation forces a one-time irq-disabled - critical section for 500us: - modprobe preemptirq_delay_test test_mode=irq delay=500000 + For example, the following invocation generates a burst of three + irq-disabled critical sections for 500us: + modprobe preemptirq_delay_test test_mode=irq delay=500 burst_size=3 If unsure, say N diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c index d8765c952fab..31c0fad4cb9e 100644 --- a/kernel/trace/preemptirq_delay_test.c +++ b/kernel/trace/preemptirq_delay_test.c @@ -10,18 +10,25 @@ #include #include #include +#include #include #include #include #include +#include static ulong delay = 100; -static char test_mode[10] = "irq"; +static char test_mode[12] = "irq"; +static uint burst_size = 1; -module_param_named(delay, delay, ulong, S_IRUGO); -module_param_string(test_mode, test_mode, 10, S_IRUGO); -MODULE_PARM_DESC(delay, "Period in microseconds (100 uS default)"); -MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt or irq (default irq)"); +module_param_named(delay, delay, ulong, 0444); +module_param_string(test_mode, test_mode, 12, 0444); +module_param_named(burst_size, burst_size, uint, 0444); +MODULE_PARM_DESC(delay, "Period in microseconds (100 us default)"); +MODULE_PARM_DESC(test_mode, "Mode of the test such as preempt, irq, or alternate (default irq)"); +MODULE_PARM_DESC(burst_size, "The size of a burst (default 1)"); + +#define MIN(x, y) ((x) < (y) ? 
(x) : (y)) static void busy_wait(ulong time) { @@ -34,37 +41,136 @@ static void busy_wait(ulong time) } while ((end - start) < (time * 1000)); } -static int preemptirq_delay_run(void *data) +static __always_inline void irqoff_test(void) { unsigned long flags; + local_irq_save(flags); + busy_wait(delay); + local_irq_restore(flags); +} - if (!strcmp(test_mode, "irq")) { - local_irq_save(flags); - busy_wait(delay); - local_irq_restore(flags); - } else if (!strcmp(test_mode, "preempt")) { - preempt_disable(); - busy_wait(delay); - preempt_enable(); +static __always_inline void preemptoff_test(void) +{ + preempt_disable(); + busy_wait(delay); + preempt_enable(); +} + +static void execute_preemptirqtest(int idx) +{ + if (!strcmp(test_mode, "irq")) + irqoff_test(); + else if (!strcmp(test_mode, "preempt")) + preemptoff_test(); + else if (!strcmp(test_mode, "alternate")) { + if (idx % 2 == 0) + irqoff_test(); + else + preemptoff_test(); } +} + +#define DECLARE_TESTFN(POSTFIX) \ + static void preemptirqtest_##POSTFIX(int idx) \ + { \ + execute_preemptirqtest(idx); \ + } \ +/* + * We create 10 different functions, so that we can get 10 different + * backtraces. + */ +DECLARE_TESTFN(0) +DECLARE_TESTFN(1) +DECLARE_TESTFN(2) +DECLARE_TESTFN(3) +DECLARE_TESTFN(4) +DECLARE_TESTFN(5) +DECLARE_TESTFN(6) +DECLARE_TESTFN(7) +DECLARE_TESTFN(8) +DECLARE_TESTFN(9) + +static void (*testfuncs[])(int) = { + preemptirqtest_0, + preemptirqtest_1, + preemptirqtest_2, + preemptirqtest_3, + preemptirqtest_4, + preemptirqtest_5, + preemptirqtest_6, + preemptirqtest_7, + preemptirqtest_8, + preemptirqtest_9, +}; + +#define NR_TEST_FUNCS ARRAY_SIZE(testfuncs) + +static int preemptirq_delay_run(void *data) +{ + int i; + int s = MIN(burst_size, NR_TEST_FUNCS); + + for (i = 0; i < s; i++) + (testfuncs[i])(i); return 0; } -static int __init preemptirq_delay_init(void) +static struct task_struct *preemptirq_start_test(void) { char task_name[50]; - struct task_struct *test_task; snprintf(task_name, sizeof(task_name), "%s_test", test_mode); + return kthread_run(preemptirq_delay_run, NULL, task_name); +} + + +static ssize_t trigger_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + preemptirq_start_test(); + return count; +} + +static struct kobj_attribute trigger_attribute = + __ATTR(trigger, 0200, NULL, trigger_store); + +static struct attribute *attrs[] = { + &trigger_attribute.attr, + NULL, +}; + +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +static struct kobject *preemptirq_delay_kobj; + +static int __init preemptirq_delay_init(void) +{ + struct task_struct *test_task; + int retval; + + test_task = preemptirq_start_test(); + retval = PTR_ERR_OR_ZERO(test_task); + if (retval != 0) + return retval; + + preemptirq_delay_kobj = kobject_create_and_add("preemptirq_delay_test", + kernel_kobj); + if (!preemptirq_delay_kobj) + return -ENOMEM; + + retval = sysfs_create_group(preemptirq_delay_kobj, &attr_group); + if (retval) + kobject_put(preemptirq_delay_kobj); - test_task = kthread_run(preemptirq_delay_run, NULL, task_name); - return PTR_ERR_OR_ZERO(test_task); + return retval; } static void __exit preemptirq_delay_exit(void) { - return; + kobject_put(preemptirq_delay_kobj); } module_init(preemptirq_delay_init) -- cgit v1.2.3 From 9c34fc4b7e903117cc27712b9e6c8690debb7e95 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:20 +0200 Subject: tracing: Use CONFIG_PREEMPTION CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by 
CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same functionality which today depends on CONFIG_PREEMPT. Add additional header output for PREEMPT_RT. Link: http://lkml.kernel.org/r/20191015191821.11479-34-bigeasy@linutronix.de Cc: Ingo Molnar Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f093a433cb42..db7d06a26861 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3726,6 +3726,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) "desktop", #elif defined(CONFIG_PREEMPT) "preempt", +#elif defined(CONFIG_PREEMPT_RT) + "preempt_rt", #else "unknown", #endif -- cgit v1.2.3 From 6dff4d7dd3e0158688683a17dd792861aa9d61e2 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 15 Oct 2019 13:10:12 +0100 Subject: tracing: Make internal ftrace events static The event_class_ftrace_##call and event_##call do not seem to be used outside of trace_export.c so make them both static to avoid a number of sparse warnings: kernel/trace/trace_entries.h:59:1: warning: symbol 'event_class_ftrace_function' was not declared. Should it be static? kernel/trace/trace_entries.h:59:1: warning: symbol '__event_function' was not declared. Should it be static? kernel/trace/trace_entries.h:77:1: warning: symbol 'event_class_ftrace_funcgraph_entry' was not declared. Should it be static? kernel/trace/trace_entries.h:77:1: warning: symbol '__event_funcgraph_entry' was not declared. Should it be static? kernel/trace/trace_entries.h:93:1: warning: symbol 'event_class_ftrace_funcgraph_exit' was not declared. Should it be static? kernel/trace/trace_entries.h:93:1: warning: symbol '__event_funcgraph_exit' was not declared. Should it be static? kernel/trace/trace_entries.h:129:1: warning: symbol 'event_class_ftrace_context_switch' was not declared. Should it be static? kernel/trace/trace_entries.h:129:1: warning: symbol '__event_context_switch' was not declared. Should it be static? kernel/trace/trace_entries.h:149:1: warning: symbol 'event_class_ftrace_wakeup' was not declared. Should it be static? kernel/trace/trace_entries.h:149:1: warning: symbol '__event_wakeup' was not declared. Should it be static? kernel/trace/trace_entries.h:171:1: warning: symbol 'event_class_ftrace_kernel_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:171:1: warning: symbol '__event_kernel_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:191:1: warning: symbol 'event_class_ftrace_user_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:191:1: warning: symbol '__event_user_stack' was not declared. Should it be static? kernel/trace/trace_entries.h:214:1: warning: symbol 'event_class_ftrace_bprint' was not declared. Should it be static? kernel/trace/trace_entries.h:214:1: warning: symbol '__event_bprint' was not declared. Should it be static? kernel/trace/trace_entries.h:230:1: warning: symbol 'event_class_ftrace_print' was not declared. Should it be static? kernel/trace/trace_entries.h:230:1: warning: symbol '__event_print' was not declared. Should it be static? kernel/trace/trace_entries.h:247:1: warning: symbol 'event_class_ftrace_raw_data' was not declared. Should it be static? kernel/trace/trace_entries.h:247:1: warning: symbol '__event_raw_data' was not declared. Should it be static? 
kernel/trace/trace_entries.h:262:1: warning: symbol 'event_class_ftrace_bputs' was not declared. Should it be static? kernel/trace/trace_entries.h:262:1: warning: symbol '__event_bputs' was not declared. Should it be static? kernel/trace/trace_entries.h:277:1: warning: symbol 'event_class_ftrace_mmiotrace_rw' was not declared. Should it be static? kernel/trace/trace_entries.h:277:1: warning: symbol '__event_mmiotrace_rw' was not declared. Should it be static? kernel/trace/trace_entries.h:298:1: warning: symbol 'event_class_ftrace_mmiotrace_map' was not declared. Should it be static? kernel/trace/trace_entries.h:298:1: warning: symbol '__event_mmiotrace_map' was not declared. Should it be static? kernel/trace/trace_entries.h:322:1: warning: symbol 'event_class_ftrace_branch' was not declared. Should it be static? kernel/trace/trace_entries.h:322:1: warning: symbol '__event_branch' was not declared. Should it be static? kernel/trace/trace_entries.h:343:1: warning: symbol 'event_class_ftrace_hwlat' was not declared. Should it be static? kernel/trace/trace_entries.h:343:1: warning: symbol '__event_hwlat' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20191015121012.18824-1-ben.dooks@codethink.co.uk Signed-off-by: Ben Dooks Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_export.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 45630a76ed3a..2e6d2e9741cc 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -171,7 +171,7 @@ ftrace_define_fields_##name(struct trace_event_call *event_call) \ #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ regfn) \ \ -struct trace_event_class __refdata event_class_ftrace_##call = { \ +static struct trace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ @@ -187,7 +187,7 @@ struct trace_event_call __used event_##call = { \ .print_fmt = print, \ .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ -struct trace_event_call __used \ +static struct trace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #undef FTRACE_ENTRY -- cgit v1.2.3 From 2d6425af61166e026e7476db64f70f1266127b1d Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:23 -0700 Subject: tracing: Declare newly exported APIs in include/linux/trace.h Declare the newly introduced and exported APIs in the header file - include/linux/trace.h. Moving previous declarations from kernel/trace/trace.h to include/linux/trace.h. 
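A hedged sketch of the header side (exact kernel-doc omitted; the two declarations shown are the ones the diff below removes from kernel/trace/trace.h):

	/* include/linux/trace.h */
	int trace_array_printk(struct trace_array *tr, unsigned long ip,
			       const char *fmt, ...);
	void trace_printk_init_buffers(void);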
Link: http://lkml.kernel.org/r/1565805327-579-2-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 718eb998c13e..90cba68c8b50 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -873,8 +874,6 @@ trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); -int trace_array_printk(struct trace_array *tr, - unsigned long ip, const char *fmt, ...); int trace_array_printk_buf(struct ring_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); @@ -1890,7 +1889,6 @@ extern const char *__start___tracepoint_str[]; extern const char *__stop___tracepoint_str[]; void trace_printk_control(bool enabled); -void trace_printk_init_buffers(void); void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); -- cgit v1.2.3 From e585e6469d6f476b82aa148dc44aaf7ae269a4e2 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:24 -0700 Subject: tracing: Verify if trace array exists before destroying it. A trace array can be destroyed from userspace or kernel. Verify if the trace array exists before proceeding to destroy/remove it. Link: http://lkml.kernel.org/r/1565805327-579-3-git-send-email-divya.indi@oracle.com Reviewed-by: Aruna Ramakrishna Signed-off-by: Divya Indi [ Removed unneeded braces ] Signed-off-by: Steven Rostedt (VMware) --- kernel/module.c | 6 +++++- kernel/trace/trace.c | 15 ++++++++++++--- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ff2d7359a418..6e2fd40a6ed9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3728,7 +3728,6 @@ static int complete_formation(struct module *mod, struct load_info *info) module_enable_ro(mod, false); module_enable_nx(mod); - module_enable_x(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. */ @@ -3751,6 +3750,11 @@ static int prepare_coming_module(struct module *mod) if (err) return err; + /* Make module executable after ftrace is enabled */ + mutex_lock(&module_mutex); + module_enable_x(mod); + mutex_unlock(&module_mutex); + blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index db7d06a26861..fa4f742fc449 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8556,17 +8556,26 @@ static int __remove_instance(struct trace_array *tr) return 0; } -int trace_array_destroy(struct trace_array *tr) +int trace_array_destroy(struct trace_array *this_tr) { + struct trace_array *tr; int ret; - if (!tr) + if (!this_tr) return -EINVAL; mutex_lock(&event_mutex); mutex_lock(&trace_types_lock); - ret = __remove_instance(tr); + ret = -ENODEV; + + /* Making sure trace array exists before destroying it. 
*/ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr == this_tr) { + ret = __remove_instance(tr); + break; + } + } mutex_unlock(&trace_types_lock); mutex_unlock(&event_mutex); -- cgit v1.2.3 From 953ae45a0c25e09428d4a03d7654f97ab8a36647 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 14 Aug 2019 10:55:25 -0700 Subject: tracing: Adding NULL checks for trace_array descriptor pointer As part of commit f45d1225adb0 ("tracing: Kernel access to Ftrace instances") we exported certain functions. Here, we are adding some additional NULL checks to ensure safe usage by users of these APIs. Link: http://lkml.kernel.org/r/1565805327-579-4-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 +++ kernel/trace/trace_events.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index fa4f742fc449..79fe4d6ecbd8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3297,6 +3297,9 @@ int trace_array_printk(struct trace_array *tr, if (!(global_trace.trace_flags & TRACE_ITER_PRINTK)) return 0; + if (!tr) + return -ENOENT; + va_start(ap, fmt); ret = trace_array_vprintk(tr, ip, fmt, ap); va_end(ap); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index fba87d10f0c1..2a3ac2365445 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -793,6 +793,8 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) char *event = NULL, *sub = NULL, *match; int ret; + if (!tr) + return -ENOENT; /* * The buf format can be : * *: means any event by that name. -- cgit v1.2.3 From b83b43ffc6e4b514ca034a0fbdee01322e2f7022 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 15 Oct 2019 09:00:55 -0400 Subject: fgraph: Fix function type mismatches of ftrace_graph_return using ftrace_stub The C compiler is allowing more checks to make sure that function pointers are assigned to the correct prototype function. Unfortunately, the function graph tracer uses a special name with its assigned ftrace_graph_return function pointer that maps to a stub function used by the function tracer (ftrace_stub). The ftrace_graph_return variable is compared to the ftrace_stub in some archs to know if the function graph tracer is enabled or not. This means we can not just simply create a new function stub that compares it without modifying all the archs. Instead, have the linker script create a function_graph_stub that maps to ftrace_stub, and this way we can define the prototype for it to match the prototype of ftrace_graph_return, and make the compiler checks all happy! Link: http://lkml.kernel.org/r/20191015090055.789a0aed@gandalf.local.home Cc: linux-sh@vger.kernel.org Cc: Yoshinori Sato Cc: Rich Felker Reported-by: Sami Tolvanen Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 7950a0356042..fa3ce10d0405 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -332,9 +332,14 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) return 0; } +/* + * Simply points to ftrace_stub, but with the proper protocol. 
+ * Defined by the linker script in linux/vmlinux.lds.h + */ +extern void ftrace_graph_stub(struct ftrace_graph_ret *); + /* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = - (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_stub; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; @@ -614,7 +619,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) goto out; ftrace_graph_active--; - ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_return = ftrace_graph_stub; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); -- cgit v1.2.3 From 80042c8f06bf5a7b87a63deaa3deb56f2cd52645 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Oct 2019 16:56:56 +0300 Subject: tracing: Use generic type for comparator function Comparator function type, cmp_func_t, is defined in the types.h, use it in the code. Link: http://lkml.kernel.org/r/20191007135656.37734-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 12 ++++++------ kernel/trace/trace_branch.c | 8 ++++---- kernel/trace/trace_stat.c | 6 ++---- kernel/trace/trace_stat.h | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d2d488c43a6a..82ef8d60a42b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -465,10 +465,10 @@ static void *function_stat_start(struct tracer_stat *trace) #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* function graph compares on total time */ -static int function_stat_cmp(void *p1, void *p2) +static int function_stat_cmp(const void *p1, const void *p2) { - struct ftrace_profile *a = p1; - struct ftrace_profile *b = p2; + const struct ftrace_profile *a = p1; + const struct ftrace_profile *b = p2; if (a->time < b->time) return -1; @@ -479,10 +479,10 @@ static int function_stat_cmp(void *p1, void *p2) } #else /* not function graph compares against hits */ -static int function_stat_cmp(void *p1, void *p2) +static int function_stat_cmp(const void *p1, const void *p2) { - struct ftrace_profile *a = p1; - struct ftrace_profile *b = p2; + const struct ftrace_profile *a = p1; + const struct ftrace_profile *b = p2; if (a->counter < b->counter) return -1; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 3ea65cdff30d..88e158d27965 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -244,7 +244,7 @@ static int annotated_branch_stat_headers(struct seq_file *m) return 0; } -static inline long get_incorrect_percent(struct ftrace_branch_data *p) +static inline long get_incorrect_percent(const struct ftrace_branch_data *p) { long percent; @@ -332,10 +332,10 @@ annotated_branch_stat_next(void *v, int idx) return p; } -static int annotated_branch_stat_cmp(void *p1, void *p2) +static int annotated_branch_stat_cmp(const void *p1, const void *p2) { - struct ftrace_branch_data *a = p1; - struct ftrace_branch_data *b = p2; + const struct ftrace_branch_data *a = p1; + const struct ftrace_branch_data *b = p2; long percent_a, percent_b; diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 9ab0a1a7ad5e..874f1274cf99 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -72,9 +72,7 @@ static 
void destroy_session(struct stat_session *session) kfree(session); } -typedef int (*cmp_stat_t)(void *, void *); - -static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) +static int insert_stat(struct rb_root *root, void *stat, cmp_func_t cmp) { struct rb_node **new = &(root->rb_node), *parent = NULL; struct stat_node *data; @@ -112,7 +110,7 @@ static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) * This one will force an insertion as right-most node * in the rbtree. */ -static int dummy_cmp(void *p1, void *p2) +static int dummy_cmp(const void *p1, const void *p2) { return -1; } diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 8786d17caf49..31d7dc5bf1db 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -16,7 +16,7 @@ struct tracer_stat { void *(*stat_start)(struct tracer_stat *trace); void *(*stat_next)(void *prev, int idx); /* Compare two entries for stats sorting */ - int (*stat_cmp)(void *p1, void *p2); + cmp_func_t stat_cmp; /* Print a stat entry */ int (*stat_show)(struct seq_file *s, void *p); /* Release an entry */ -- cgit v1.2.3 From 0c3c86bdc691c794a6154f8515b7fa82c82dfc4d Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat (VMware)" Date: Thu, 10 Oct 2019 11:51:17 -0700 Subject: tracing/hwlat: Fix a few trivial nits Update the source file name in the comments, and fix a grammatical error. Link: http://lkml.kernel.org/r/157073346821.17189.8946944856026592247.stgit@srivatsa-ubuntu Signed-off-by: Srivatsa S. Bhat (VMware) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 63526670605a..6638d63f0921 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * trace_hwlatdetect.c - A simple Hardware Latency detector. + * trace_hwlat.c - A simple Hardware Latency detector. * * Use this tracer to detect large system latencies induced by the behavior of * certain underlying system hardware or firmware, independent of Linux itself. @@ -279,7 +279,7 @@ static void move_to_next_cpu(void) return; /* * If for some reason the user modifies the CPU affinity - * of this thread, than stop migrating for the duration + * of this thread, then stop migrating for the duration * of the current test. */ if (!cpumask_equal(current_mask, current->cpus_ptr)) -- cgit v1.2.3 From 6ee40511cb838f9ced002dff7131bca87e3ccbdd Mon Sep 17 00:00:00 2001 From: Yuming Han Date: Thu, 24 Oct 2019 11:34:30 +0800 Subject: tracing: use kvcalloc for tgid_map array allocation Allocation of memory for tgid_map can fail, because it requires an order-6 page. Details as follows: c3 sh: page allocation failure: order:6, mode:0x140c0c0(GFP_KERNEL), nodemask=(null) c3 sh cpuset=/ mems_allowed=0 c3 CPU: 3 PID: 5632 Comm: sh Tainted: G W O 4.14.133+ #10 c3 Hardware name: Generic DT based system c3 Backtrace: c3 [] (dump_backtrace) from [](show_stack+0x18/0x1c) c3 [] (show_stack) from [](dump_stack+0x84/0xa4) c3 [] (dump_stack) from [](warn_alloc+0xc4/0x19c) c3 [] (warn_alloc) from [](__alloc_pages_nodemask+0xd18/0xf28) c3 [] (__alloc_pages_nodemask) from [](kmalloc_order+0x20/0x38) c3 [] (kmalloc_order) from [](kmalloc_order_trace+0x24/0x108) c3 [] (kmalloc_order_trace) from [](set_tracer_flag+0xb0/0x158) c3 [] (set_tracer_flag) from [](trace_options_core_write+0x7c/0xcc) c3 [] (trace_options_core_write) from [](__vfs_write+0x40/0x14c) c3 [] (__vfs_write) from [](vfs_write+0xc4/0x198) c3 [] (vfs_write) from [](SyS_write+0x6c/0xd0) c3 [] (SyS_write) from [](ret_fast_syscall+0x0/0x54) Switch to use kvcalloc to avoid unexpected allocation failures. Link: http://lkml.kernel.org/r/1571888070-24425-1-git-send-email-chunyan.zhang@unisoc.com Signed-off-by: Yuming Han Signed-off-by: Chunyan Zhang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 79fe4d6ecbd8..42659ce6ac0c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4686,7 +4686,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_TGID) { if (!tgid_map) - tgid_map = kcalloc(PID_MAX_DEFAULT + 1, + tgid_map = kvcalloc(PID_MAX_DEFAULT + 1, sizeof(*tgid_map), GFP_KERNEL); if (!tgid_map) { -- cgit v1.2.3 From c7411a1a126f649be71526a36d4afac9e5aefa13 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 29 Oct 2019 17:31:44 +0900 Subject: tracing/kprobe: Check whether the non-suffixed symbol is notrace Check whether the non-suffixed symbol is notrace, since suffixed symbols are generated by the compilers for optimization. The notrace check might not work on these suffixed symbols, because some of them are just partial code of the original function (e.g. cold-cache (unlikely) code is separated from the original function as FUNCTION.cold.XX). For example, without this fix, # echo p device_add.cold.67 > /sys/kernel/debug/tracing/kprobe_events sh: write error: Invalid argument # cat /sys/kernel/debug/tracing/error_log [ 135.491035] trace_kprobe: error: Failed to register probe event Command: p device_add.cold.67 ^ # dmesg | tail -n 1 [ 135.488599] trace_kprobe: Could not probe notrace function device_add.cold.67 With this, # echo p device_add.cold.66 > /sys/kernel/debug/tracing/kprobe_events # cat /sys/kernel/debug/kprobes/list ffffffff81599de9 k device_add.cold.66+0x0 [DISABLED] Actually, the kprobe blacklist already does a similar thing; see within_kprobe_blacklist().
Link: http://lkml.kernel.org/r/157233790394.6706.18243942030937189679.stgit@devnote2 Fixes: 45408c4f9250 ("tracing: kprobes: Prohibit probing on notrace function") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1552a95c743b..7f890262c8a3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -435,11 +435,10 @@ static int disable_trace_kprobe(struct trace_event_call *call, #if defined(CONFIG_KPROBES_ON_FTRACE) && \ !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) -static bool within_notrace_func(struct trace_kprobe *tk) +static bool __within_notrace_func(unsigned long addr) { - unsigned long offset, size, addr; + unsigned long offset, size; - addr = trace_kprobe_address(tk); if (!addr || !kallsyms_lookup_size_offset(addr, &size, &offset)) return false; @@ -452,6 +451,28 @@ static bool within_notrace_func(struct trace_kprobe *tk) */ return !ftrace_location_range(addr, addr + size - 1); } + +static bool within_notrace_func(struct trace_kprobe *tk) +{ + unsigned long addr = trace_kprobe_address(tk); + char symname[KSYM_NAME_LEN], *p; + + if (!__within_notrace_func(addr)) + return false; + + /* Check if the address is on a suffixed-symbol */ + if (!lookup_symbol_name(addr, symname)) { + p = strchr(symname, '.'); + if (!p) + return true; + *p = '\0'; + addr = (unsigned long)kprobe_lookup_name(symname, 0); + if (addr) + return __within_notrace_func(addr); + } + + return true; +} #else #define within_notrace_func(tk) (false) #endif -- cgit v1.2.3 From ef56e047b2bd4dabb801fd073dfcab5f40de5f78 Mon Sep 17 00:00:00 2001 From: Piotr Maziarz Date: Thu, 7 Nov 2019 13:45:38 +0100 Subject: tracing: Use seq_buf_hex_dump() to dump buffers Without this, buffers can be printed with the __print_array macro, which has no formatting options and can be hard to read. The alternative is to mimic the formatting with multiple trace event calls, one per row, which hurts performance and gives each row a different timestamp.
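As a usage sketch (hedged: the tracepoint-side wrapper, here called __print_hex_dump, is defined outside kernel/ and therefore does not appear in this diff), an event could format a binary payload in classic hexdump style:

	TP_printk("buf:%s",
		  __print_hex_dump("", DUMP_PREFIX_OFFSET, 16, 1,
				   __get_dynamic_array(buf),
				   __get_dynamic_array_len(buf), false))

This renders 16 bytes per row with offsets instead of __print_array's flat list of values, and costs a single trace event with a single timestamp.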
Link: http://lkml.kernel.org/r/1573130738-29390-2-git-send-email-piotrx.maziarz@linux.intel.com Signed-off-by: Piotr Maziarz Signed-off-by: Cezary Rojewski Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 15 +++++++++++++++ kernel/trace/trace_seq.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d54ce252b05a..d9b4b7c22db4 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -274,6 +274,21 @@ trace_print_array_seq(struct trace_seq *p, const void *buf, int count, } EXPORT_SYMBOL(trace_print_array_seq); +const char * +trace_print_hex_dump_seq(struct trace_seq *p, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ + const char *ret = trace_seq_buffer_ptr(p); + + trace_seq_putc(p, '\n'); + trace_seq_hex_dump(p, prefix_str, prefix_type, + rowsize, groupsize, buf, len, ascii); + trace_seq_putc(p, 0); + return ret; +} +EXPORT_SYMBOL(trace_print_hex_dump_seq); + int trace_raw_output_prep(struct trace_iterator *iter, struct trace_event *trace_event) { diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c index 6b1c562ffdaf..344e4c1aa09c 100644 --- a/kernel/trace/trace_seq.c +++ b/kernel/trace/trace_seq.c @@ -376,3 +376,33 @@ int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) return seq_buf_to_user(&s->seq, ubuf, cnt); } EXPORT_SYMBOL_GPL(trace_seq_to_user); + +int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, + int prefix_type, int rowsize, int groupsize, + const void *buf, size_t len, bool ascii) +{ + unsigned int save_len = s->seq.len; + + if (s->full) + return 0; + + __trace_seq_init(s); + + if (TRACE_SEQ_BUF_LEFT(s) < 1) { + s->full = 1; + return 0; + } + + seq_buf_hex_dump(&(s->seq), prefix_str, + prefix_type, rowsize, groupsize, + buf, len, ascii); + + if (unlikely(seq_buf_has_overflowed(&s->seq))) { + s->seq.len = save_len; + s->full = 1; + return 0; + } + + return 1; +} +EXPORT_SYMBOL(trace_seq_hex_dump); -- cgit v1.2.3 From 9b4712044d059e7842aaeeafd7c7a7ee88c589db Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 12 Nov 2019 18:42:19 +0100 Subject: tracing: Remove stray tab in TRACE_EVAL_MAP_FILE's help text There was a stray tab in the help text of the aforementioned config option which showed like this: The "print fmt" of the trace events will show the enum/sizeof names instead of their values. This can cause problems for user space tools ... in menuconfig. Remove it and end a sentence with a fullstop. No functional changes. Link: http://lkml.kernel.org/r/20191112174219.10933-1-bp@alien8.de Signed-off-by: Borislav Petkov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d25314bc7a1c..b872716bb2a0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -771,7 +771,7 @@ config TRACE_EVAL_MAP_FILE depends on TRACING help The "print fmt" of the trace events will show the enum/sizeof names - instead of their values. This can cause problems for user space tools + instead of their values. This can cause problems for user space tools that use this string to parse the raw data as user space does not know how to convert the string to its value. @@ -792,7 +792,7 @@ config TRACE_EVAL_MAP_FILE they are needed for the "eval_map" file. 
Enabling this option will increase the memory footprint of the running kernel. - If unsure, say N + If unsure, say N. config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" -- cgit v1.2.3 From 36b3615dc3b625c8b587f34e413a600f7ac16403 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Nov 2019 22:43:58 -0500 Subject: tracing: Add missing "inline" in stub function of latency_fsnotify() The latency_fsnotify() stub, used when the function is not defined, was missing the "inline". Link: https://lore.kernel.org/r/20191115140213.74c5efe7@canb.auug.org.au Reported-by: Stephen Rothwell Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 90cba68c8b50..2df8aed6a8f0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -801,7 +801,7 @@ void latency_fsnotify(struct trace_array *tr); #else -static void latency_fsnotify(struct trace_array *tr) { } +static inline void latency_fsnotify(struct trace_array *tr) { } #endif -- cgit v1.2.3 From 0567d6809182df53da03636fad36c507c5cf07a5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Nov 2019 14:39:35 -0500 Subject: ftrace: Add modify_ftrace_direct() Add a new function modify_ftrace_direct() that will allow a user to update an existing direct caller to a new trampoline, without missing hits due to unregistering one and then adding another. Link: https://lore.kernel.org/r/20191109022907.6zzo6orhxpt5n2sv@ast-mbp.dhcp.thefacebook.com Suggested-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 82ef8d60a42b..834f3556ea1e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5160,6 +5160,84 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) return ret; } EXPORT_SYMBOL_GPL(unregister_ftrace_direct); + +static struct ftrace_ops stub_ops = { + .func = ftrace_stub, +}; + +/** + * modify_ftrace_direct - Modify an existing direct call to call something else + * @ip: The instruction pointer to modify + * @old_addr: The address that the current @ip calls directly + * @new_addr: The address that the @ip should call + * + * This modifies a ftrace direct caller at an instruction pointer without + * having to disable it first. The direct call will switch over to the + * @new_addr without missing anything. + * + * Returns: zero on success.
Non zero on error, which includes: + * -ENODEV : the @ip given has no direct caller attached + * -EINVAL : the @old_addr does not match the current direct caller + */ +int modify_ftrace_direct(unsigned long ip, + unsigned long old_addr, unsigned long new_addr) +{ + struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + entry = __ftrace_lookup_ip(direct_functions, ip); + if (!entry) { + /* OK if it is off by a little */ + rec = lookup_rec(ip, ip); + if (!rec || rec->ip == ip) + goto out_unlock; + + entry = __ftrace_lookup_ip(direct_functions, rec->ip); + if (!entry) + goto out_unlock; + + ip = rec->ip; + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); + } + + ret = -EINVAL; + if (entry->direct != old_addr) + goto out_unlock; + + /* + * By setting a stub function at the same address, we force + * the code to call the iterator and the direct_ops helper. + * This means that @ip does not call the direct call, and + * we can simply modify it. + */ + ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); + if (ret) + goto out_unlock; + + ret = register_ftrace_function(&stub_ops); + if (ret) { + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + goto out_unlock; + } + + entry->direct = new_addr; + + /* + * By removing the stub, we put back the direct call, calling + * the @new_addr. + */ + unregister_ftrace_function(&stub_ops); + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + + ret = 0; + + out_unlock: + mutex_unlock(&direct_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(modify_ftrace_direct); #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** -- cgit v1.2.3 From e9838bd51169af87ae248336d4c3fc59184a0e46 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 13 Nov 2019 18:12:01 +0100 Subject: irq_work: Fix IRQ_WORK_BUSY bit clearing While attempting to clear the busy bit at the end of a work execution, atomic_cmpxchg() expects the value of the flags with the pending bit cleared as the old value. However by mistake the value of the flags is passed without clearing the pending bit first. As a result, clearing the busy bit fails and irq_work_sync() may stall: watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [blktrace:4948] CPU: 0 PID: 4948 Comm: blktrace Not tainted 5.4.0-rc7-00003-gfeb4a51323bab #1 RIP: 0010:irq_work_sync+0x4/0x10 Call Trace: relay_close_buf+0x19/0x50 relay_close+0x64/0x100 blk_trace_free+0x1f/0x50 __blk_trace_remove+0x1e/0x30 blk_trace_ioctl+0x11b/0x140 blkdev_ioctl+0x6c1/0xa40 block_ioctl+0x39/0x40 do_vfs_ioctl+0xa5/0x700 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1d0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 So clear the appropriate bit before passing the old flags to cmpxchg(). Fixes: feb4a51323ba ("irq_work: Slightly simplify IRQ_WORK_PENDING clearing") Reported-by: kernel test robot Reported-by: Leonard Crestez Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Leonard Crestez Link: https://lkml.kernel.org/r/20191113171201.14032-1-frederic@kernel.org --- kernel/irq_work.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 49c53f80a13a..828cc30774bc 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -158,6 +158,7 @@ static void irq_work_run_list(struct llist_head *list) * Clear the BUSY bit and return to the free state if * no-one else claimed it meanwhile. 
*/ + flags &= ~IRQ_WORK_PENDING; (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); } } -- cgit v1.2.3 From 20a15ee040f23bd553d4e6bbb1f8724ccd282abc Mon Sep 17 00:00:00 2001 From: luanshi Date: Wed, 13 Nov 2019 22:41:33 +0800 Subject: genirq: Fix function documentation of __irq_alloc_descs() The function got renamed at some point, but the kernel-doc was not updated. Signed-off-by: Liguang Zhang Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1573656093-8643-1-git-send-email-zhangliguang@linux.alibaba.com --- kernel/irq/irqdesc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9be995fc3c5a..5b8fdd659e54 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -750,7 +750,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt) EXPORT_SYMBOL_GPL(irq_free_descs); /** - * irq_alloc_descs - allocate and initialize a range of irq descriptors + * __irq_alloc_descs - allocate and initialize a range of irq descriptors * @irq: Allocate for specific irq number if irq >= 0 * @from: Start the search from this irq number * @cnt: Number of consecutive irqs to allocate. -- cgit v1.2.3 From 3ca47e958a64b1116a2c35e65dcf467fc53d52de Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Apr 2019 17:43:50 +0200 Subject: y2038: remove CONFIG_64BIT_TIME The CONFIG_64BIT_TIME option is defined on all architectures, and can be removed for simplicity now. Signed-off-by: Arnd Bergmann --- kernel/time/hrtimer.c | 2 +- kernel/time/time.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 65605530ee34..9e20873148c6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1940,7 +1940,7 @@ out: return ret; } -#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +#ifdef CONFIG_64BIT SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) diff --git a/kernel/time/time.c b/kernel/time/time.c index 5c54ca632d08..96b8c02657ed 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -267,7 +267,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, } #endif -#if !defined(CONFIG_64BIT_TIME) || defined(CONFIG_64BIT) +#ifdef CONFIG_64BIT SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p) { struct __kernel_timex txc; /* Local copy of parameter */ @@ -881,7 +881,7 @@ int get_timespec64(struct timespec64 *ts, ts->tv_sec = kts.tv_sec; /* Zero out the padding for 32 bit systems or in compat mode */ - if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall()) + if (in_compat_syscall()) kts.tv_nsec &= 0xFFFFFFFFUL; ts->tv_nsec = kts.tv_nsec; -- cgit v1.2.3 From 2a785996cc5e2fc1d1d29d196f530905f68d2dc2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Nov 2019 11:10:01 +0100 Subject: y2038: uapi: change __kernel_time_t to __kernel_old_time_t This is mainly a patch for clarification, and to let us remove the time_t definition from the kernel to prevent new users from creeping in that might not be y2038-safe. All remaining uses of 'time_t' or '__kernel_time_t' are part of the user API that cannot be changed; they either have a replacement or do not suffer from the y2038 overflow.
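For context, the overflow being designed out here is simple to demonstrate (illustration only, not part of the patch):

	int32_t t = 0x7fffffff;	/* 2038-01-19 03:14:07 UTC on a 32-bit time_t */
	t += 1;			/* signed overflow: undefined behavior, and in
				 * practice a wrap to 1901-12-13 20:45:52 UTC */

A 64-bit time_t pushes the limit far beyond any hardware lifetime.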
Acked-by: Deepa Dinamani Acked-by: Christian Brauner Signed-off-by: Arnd Bergmann --- kernel/time/time.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 96b8c02657ed..833abae3364f 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -59,9 +59,9 @@ EXPORT_SYMBOL(sys_tz); * why not move it into the appropriate arch directory (for those * architectures that need it). */ -SYSCALL_DEFINE1(time, time_t __user *, tloc) +SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc) { - time_t i = (time_t)ktime_get_real_seconds(); + __kernel_old_time_t i = (__kernel_old_time_t)ktime_get_real_seconds(); if (tloc) { if (put_user(i,tloc)) @@ -78,7 +78,7 @@ SYSCALL_DEFINE1(time, time_t __user *, tloc) * architectures that need it). */ -SYSCALL_DEFINE1(stime, time_t __user *, tptr) +SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr) { struct timespec64 tv; int err; -- cgit v1.2.3 From bdd565f817a74b9e30edec108f7cb1dbc762b8a6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 22:46:48 +0200 Subject: y2038: rusage: use __kernel_old_timeval There are two 'struct timeval' fields in 'struct rusage'. Unfortunately the definition of timeval is now ambiguous when used in user space with a libc that has a 64-bit time_t, and this also changes the 'rusage' definition in user space in a way that is incompatible with the system call interface. While there is no good solution to avoid all ambiguity here, change the definition in the kernel headers to be compatible with the kernel ABI, using __kernel_old_timeval as an unambiguous base type. In previous discussions, there was also a plan to add a replacement for rusage based on 64-bit timestamps and nanosecond resolution, i.e. 'struct __kernel_timespec'. I have patches for that as well, if anyone thinks we should do that. Reviewed-by: Cyrill Gorcunov Signed-off-by: Arnd Bergmann --- kernel/sys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index a611d1d58c7d..d3aef31e24dc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1763,8 +1763,8 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) unlock_task_sighand(p, &flags); out: - r->ru_utime = ns_to_timeval(utime); - r->ru_stime = ns_to_timeval(stime); + r->ru_utime = ns_to_kernel_old_timeval(utime); + r->ru_stime = ns_to_kernel_old_timeval(stime); if (who != RUSAGE_CHILDREN) { struct mm_struct *mm = get_task_mm(p); -- cgit v1.2.3 From 75d319c06e6a76f67549c0ae1007dc3167804f4e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 22:56:17 +0200 Subject: y2038: syscalls: change remaining timeval to __kernel_old_timeval All of the remaining syscalls that pass a timeval (gettimeofday, utime, futimesat) can trivially be changed to pass a __kernel_old_timeval instead, which has a compatible layout, but avoids ambiguity with the timeval type in user space. Acked-by: Christian Brauner Acked-by: Rafael J. 
Wysocki Signed-off-by: Arnd Bergmann --- kernel/power/power.h | 2 +- kernel/time/time.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 44bee462ff57..7cdc64dc2373 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -179,7 +179,7 @@ extern void swsusp_close(fmode_t); extern int swsusp_unmark(void); #endif -struct timeval; +struct __kernel_old_timeval; /* kernel/power/swsusp.c */ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); diff --git a/kernel/time/time.c b/kernel/time/time.c index 833abae3364f..a0e7b9909f2d 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -137,7 +137,7 @@ SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr) #endif /* __ARCH_WANT_SYS_TIME32 */ #endif -SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, +SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { if (likely(tv != NULL)) { -- cgit v1.2.3 From 5e0fb1b57bea8d11fe77da2bc80f4c9a67e28318 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 15 Aug 2018 20:04:11 +0200 Subject: y2038: time: avoid timespec usage in settimeofday() The compat_get_timeval() and timeval_valid() interfaces are deprecated and getting removed along with the definition of struct timeval itself. Change the two implementations of the settimeofday() system call to open-code these helpers and completely avoid references to timeval. The timeval_valid() call is not needed any more here, only a check to avoid overflowing tv_nsec during the multiplication, as there is another range check in do_sys_settimeofday64(). Tested-by: syzbot+dccce9b26ba09ca49966@syzkaller.appspotmail.com Signed-off-by: Arnd Bergmann --- kernel/time/time.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index a0e7b9909f2d..58e312e7380f 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -196,22 +196,21 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz return 0; } -SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, +SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; - struct timeval user_tv; struct timezone new_tz; if (tv) { - if (copy_from_user(&user_tv, tv, sizeof(*tv))) + if (get_user(new_ts.tv_sec, &tv->tv_sec) || + get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; - if (!timeval_valid(&user_tv)) + if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) @@ -245,18 +244,17 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, struct timezone __user *, tz) { struct timespec64 new_ts; - struct timeval user_tv; struct timezone new_tz; if (tv) { - if (compat_get_timeval(&user_tv, tv)) + if (get_user(new_ts.tv_sec, &tv->tv_sec) || + get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; - if (!timeval_valid(&user_tv)) + if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; - new_ts.tv_sec = user_tv.tv_sec; - new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + new_ts.tv_nsec *= NSEC_PER_USEC; } if (tz) { if (copy_from_user(&new_tz, tz, sizeof(*tz))) -- cgit v1.2.3 From c1745f84be2657f5702388133551b759b9237f59 Mon Sep 17 
00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 10:46:22 +0200 Subject: y2038: itimer: compat handling to itimer.c The structure is only used in one place, moving it there simplifies the interface and helps with later changes to this code. Rename it to match the other time32 structures in the process. Reviewed-by: Thomas Gleixner Signed-off-by: Arnd Bergmann --- kernel/compat.c | 24 ------------------------ kernel/time/itimer.c | 42 +++++++++++++++++++++++++++++++++++------- 2 files changed, 35 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index a2bc1d6ceb57..95005f849c68 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -90,30 +90,6 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts) } EXPORT_SYMBOL_GPL(compat_put_timespec); -int get_compat_itimerval(struct itimerval *o, const struct compat_itimerval __user *i) -{ - struct compat_itimerval v32; - - if (copy_from_user(&v32, i, sizeof(struct compat_itimerval))) - return -EFAULT; - o->it_interval.tv_sec = v32.it_interval.tv_sec; - o->it_interval.tv_usec = v32.it_interval.tv_usec; - o->it_value.tv_sec = v32.it_value.tv_sec; - o->it_value.tv_usec = v32.it_value.tv_usec; - return 0; -} - -int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerval *i) -{ - struct compat_itimerval v32; - - v32.it_interval.tv_sec = i->it_interval.tv_sec; - v32.it_interval.tv_usec = i->it_interval.tv_usec; - v32.it_value.tv_sec = i->it_value.tv_sec; - v32.it_value.tv_usec = i->it_value.tv_usec; - return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0; -} - #ifdef __ARCH_WANT_SYS_SIGPROCMASK /* diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 77f1e5635cc1..c52ebb40b60b 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -112,19 +112,34 @@ SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) } #ifdef CONFIG_COMPAT +struct old_itimerval32 { + struct old_timeval32 it_interval; + struct old_timeval32 it_value; +}; + +static int put_old_itimerval32(struct old_itimerval32 __user *o, const struct itimerval *i) +{ + struct old_itimerval32 v32; + + v32.it_interval.tv_sec = i->it_interval.tv_sec; + v32.it_interval.tv_usec = i->it_interval.tv_usec; + v32.it_value.tv_sec = i->it_value.tv_sec; + v32.it_value.tv_usec = i->it_value.tv_usec; + return copy_to_user(o, &v32, sizeof(struct old_itimerval32)) ? 
-EFAULT : 0; +} + COMPAT_SYSCALL_DEFINE2(getitimer, int, which, - struct compat_itimerval __user *, it) + struct old_itimerval32 __user *, it) { struct itimerval kit; int error = do_getitimer(which, &kit); - if (!error && put_compat_itimerval(it, &kit)) + if (!error && put_old_itimerval32(it, &kit)) error = -EFAULT; return error; } #endif - /* * The timer is automagically restarted, when interval != 0 */ @@ -310,15 +325,28 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, } #ifdef CONFIG_COMPAT +static int get_old_itimerval32(struct itimerval *o, const struct old_itimerval32 __user *i) +{ + struct old_itimerval32 v32; + + if (copy_from_user(&v32, i, sizeof(struct old_itimerval32))) + return -EFAULT; + o->it_interval.tv_sec = v32.it_interval.tv_sec; + o->it_interval.tv_usec = v32.it_interval.tv_usec; + o->it_value.tv_sec = v32.it_value.tv_sec; + o->it_value.tv_usec = v32.it_value.tv_usec; + return 0; +} + COMPAT_SYSCALL_DEFINE3(setitimer, int, which, - struct compat_itimerval __user *, in, - struct compat_itimerval __user *, out) + struct old_itimerval32 __user *, in, + struct old_itimerval32 __user *, out) { struct itimerval kin, kout; int error; if (in) { - if (get_compat_itimerval(&kin, in)) + if (get_old_itimerval32(&kin, in)) return -EFAULT; } else { memset(&kin, 0, sizeof(kin)); @@ -327,7 +355,7 @@ COMPAT_SYSCALL_DEFINE3(setitimer, int, which, error = do_setitimer(which, &kin, out ? &kout : NULL); if (error || !out) return error; - if (put_compat_itimerval(out, &kout)) + if (put_old_itimerval32(out, &kout)) return -EFAULT; return 0; } -- cgit v1.2.3 From 4c22ea2b91203564fdf392b3d3cae249b652a8ae Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 16:59:39 +0200 Subject: y2038: use compat_{get,set}_itimer on alpha The itimer handling for the old alpha osf_setitimer/osf_getitimer system calls is identical to the compat version of getitimer/setitimer, so just use those directly. Signed-off-by: Arnd Bergmann --- kernel/time/itimer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index c52ebb40b60b..4664c6addf69 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -111,7 +111,7 @@ SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) return error; } -#ifdef CONFIG_COMPAT +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) struct old_itimerval32 { struct old_timeval32 it_interval; struct old_timeval32 it_value; @@ -324,7 +324,7 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, return 0; } -#ifdef CONFIG_COMPAT +#if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) static int get_old_itimerval32(struct itimerval *o, const struct old_itimerval32 __user *i) { struct old_itimerval32 v32; -- cgit v1.2.3 From ddbc7d0657e9fd38b69f16bd0310703367b52d29 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 25 Oct 2019 21:37:43 +0200 Subject: y2038: move itimer reset into itimer.c Preparing for a change to the itimer internals, stop using the do_setitimer() symbol and instead use a new higher-level interface. The do_getitimer()/do_setitimer functions can now be made static, allowing the compiler to potentially produce better object code. 
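The one in-tree user is the SELinux bprm hook, which previously open-coded this loop; presumably it now reads along these lines (a sketch, not the verbatim hunk, which lives outside kernel/):

	/* security/selinux/hooks.c: reset inherited timers on a
	 * credential-changing exec */
	if (!fatal_signal_pending(current)) {
		flush_signal_handlers(current, 1);
		sigemptyset(&current->blocked);
		clear_itimer();
	}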
Reviewed-by: Thomas Gleixner Signed-off-by: Arnd Bergmann --- kernel/time/itimer.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 4664c6addf69..ce9cd19ce72e 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -73,7 +73,7 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, value->it_interval = ns_to_timeval(interval); } -int do_getitimer(int which, struct itimerval *value) +static int do_getitimer(int which, struct itimerval *value) { struct task_struct *tsk = current; @@ -197,7 +197,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, #define timeval_valid(t) \ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) -int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) +static int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; struct hrtimer *timer; @@ -249,6 +249,17 @@ again: return 0; } +#ifdef CONFIG_SECURITY_SELINUX +void clear_itimer(void) +{ + struct itimerval v = {}; + int i; + + for (i = 0; i < 3; i++) + do_setitimer(i, &v, NULL); +} +#endif + #ifdef __ARCH_WANT_SYS_ALARM /** -- cgit v1.2.3 From bd40a175769d411b2a37e1c087082ac7ee2c15bb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 7 Nov 2019 15:27:39 +0100 Subject: y2038: itimer: change implementation to timespec64 There is no 64-bit version of getitimer/setitimer since that is not actually needed. However, the implementation is built around the deprecated 'struct timeval' type. Change the code to use timespec64 internally to reduce the dependencies on timeval and associated helper functions. Minor adjustments in the code are needed to make the native and compat version work the same way, and to keep the range check working after the conversion. 
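The user-visible contract is unchanged by the internal timespec64 switch; classic usage (illustration only) still passes microseconds:

	struct itimerval it = {
		.it_value    = { .tv_sec = 1, .tv_usec = 0      }, /* first SIGALRM in 1s */
		.it_interval = { .tv_sec = 0, .tv_usec = 500000 }, /* then every 500ms */
	};
	setitimer(ITIMER_REAL, &it, NULL);

The kernel now converts tv_usec to tv_nsec once at copy-in and back at copy-out, keeping all internal arithmetic in a single representation.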
Signed-off-by: Arnd Bergmann --- kernel/time/itimer.c | 158 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 96 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index ce9cd19ce72e..5872db9bd5f7 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -26,7 +26,7 @@ * Returns the delta between the expiry time and now, which can be * less than zero or 1usec for an pending expired timer */ -static struct timeval itimer_get_remtime(struct hrtimer *timer) +static struct timespec64 itimer_get_remtime(struct hrtimer *timer) { ktime_t rem = __hrtimer_get_remaining(timer, true); @@ -41,11 +41,11 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) } else rem = 0; - return ktime_to_timeval(rem); + return ktime_to_timespec64(rem); } static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - struct itimerval *const value) + struct itimerspec64 *const value) { u64 val, interval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; @@ -69,11 +69,11 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, spin_unlock_irq(&tsk->sighand->siglock); - value->it_value = ns_to_timeval(val); - value->it_interval = ns_to_timeval(interval); + value->it_value = ns_to_timespec64(val); + value->it_interval = ns_to_timespec64(interval); } -static int do_getitimer(int which, struct itimerval *value) +static int do_getitimer(int which, struct itimerspec64 *value) { struct task_struct *tsk = current; @@ -82,7 +82,7 @@ static int do_getitimer(int which, struct itimerval *value) spin_lock_irq(&tsk->sighand->siglock); value->it_value = itimer_get_remtime(&tsk->signal->real_timer); value->it_interval = - ktime_to_timeval(tsk->signal->it_real_incr); + ktime_to_timespec64(tsk->signal->it_real_incr); spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: @@ -97,17 +97,26 @@ static int do_getitimer(int which, struct itimerval *value) return 0; } +static int put_itimerval(struct itimerval __user *o, + const struct itimerspec64 *i) +{ + struct itimerval v; + + v.it_interval.tv_sec = i->it_interval.tv_sec; + v.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC; + v.it_value.tv_sec = i->it_value.tv_sec; + v.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC; + return copy_to_user(o, &v, sizeof(struct itimerval)) ? -EFAULT : 0; +} + + SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) { - int error = -EFAULT; - struct itimerval get_buffer; + struct itimerspec64 get_buffer; + int error = do_getitimer(which, &get_buffer); - if (value) { - error = do_getitimer(which, &get_buffer); - if (!error && - copy_to_user(value, &get_buffer, sizeof(get_buffer))) - error = -EFAULT; - } + if (!error && put_itimerval(value, &get_buffer)) + error = -EFAULT; return error; } @@ -117,24 +126,25 @@ struct old_itimerval32 { struct old_timeval32 it_value; }; -static int put_old_itimerval32(struct old_itimerval32 __user *o, const struct itimerval *i) +static int put_old_itimerval32(struct old_itimerval32 __user *o, + const struct itimerspec64 *i) { struct old_itimerval32 v32; v32.it_interval.tv_sec = i->it_interval.tv_sec; - v32.it_interval.tv_usec = i->it_interval.tv_usec; + v32.it_interval.tv_usec = i->it_interval.tv_nsec / NSEC_PER_USEC; v32.it_value.tv_sec = i->it_value.tv_sec; - v32.it_value.tv_usec = i->it_value.tv_usec; + v32.it_value.tv_usec = i->it_value.tv_nsec / NSEC_PER_USEC; return copy_to_user(o, &v32, sizeof(struct old_itimerval32)) ? 
-EFAULT : 0; } COMPAT_SYSCALL_DEFINE2(getitimer, int, which, - struct old_itimerval32 __user *, it) + struct old_itimerval32 __user *, value) { - struct itimerval kit; - int error = do_getitimer(which, &kit); + struct itimerspec64 get_buffer; + int error = do_getitimer(which, &get_buffer); - if (!error && put_old_itimerval32(it, &kit)) + if (!error && put_old_itimerval32(value, &get_buffer)) error = -EFAULT; return error; } @@ -156,8 +166,8 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) } static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, - const struct itimerval *const value, - struct itimerval *const ovalue) + const struct itimerspec64 *const value, + struct itimerspec64 *const ovalue) { u64 oval, nval, ointerval, ninterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; @@ -166,8 +176,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, * Use the to_ktime conversion because that clamps the maximum * value to KTIME_MAX and avoid multiplication overflows. */ - nval = ktime_to_ns(timeval_to_ktime(value->it_value)); - ninterval = ktime_to_ns(timeval_to_ktime(value->it_interval)); + nval = timespec64_to_ns(&value->it_value); + ninterval = timespec64_to_ns(&value->it_interval); spin_lock_irq(&tsk->sighand->siglock); @@ -186,8 +196,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, spin_unlock_irq(&tsk->sighand->siglock); if (ovalue) { - ovalue->it_value = ns_to_timeval(oval); - ovalue->it_interval = ns_to_timeval(ointerval); + ovalue->it_value = ns_to_timespec64(oval); + ovalue->it_interval = ns_to_timespec64(ointerval); } } @@ -197,19 +207,13 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, #define timeval_valid(t) \ (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) -static int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) +static int do_setitimer(int which, struct itimerspec64 *value, + struct itimerspec64 *ovalue) { struct task_struct *tsk = current; struct hrtimer *timer; ktime_t expires; - /* - * Validate the timevals in value. 
- */ - if (!timeval_valid(&value->it_value) || - !timeval_valid(&value->it_interval)) - return -EINVAL; - switch (which) { case ITIMER_REAL: again: @@ -218,7 +222,7 @@ again: if (ovalue) { ovalue->it_value = itimer_get_remtime(timer); ovalue->it_interval - = ktime_to_timeval(tsk->signal->it_real_incr); + = ktime_to_timespec64(tsk->signal->it_real_incr); } /* We are sharing ->siglock with it_real_fn() */ if (hrtimer_try_to_cancel(timer) < 0) { @@ -226,10 +230,10 @@ again: hrtimer_cancel_wait_running(timer); goto again; } - expires = timeval_to_ktime(value->it_value); + expires = timespec64_to_ktime(value->it_value); if (expires != 0) { tsk->signal->it_real_incr = - timeval_to_ktime(value->it_interval); + timespec64_to_ktime(value->it_interval); hrtimer_start(timer, expires, HRTIMER_MODE_REL); } else tsk->signal->it_real_incr = 0; @@ -252,7 +256,7 @@ again: #ifdef CONFIG_SECURITY_SELINUX void clear_itimer(void) { - struct itimerval v = {}; + struct itimerspec64 v = {}; int i; for (i = 0; i < 3; i++) @@ -276,15 +280,15 @@ void clear_itimer(void) */ static unsigned int alarm_setitimer(unsigned int seconds) { - struct itimerval it_new, it_old; + struct itimerspec64 it_new, it_old; #if BITS_PER_LONG < 64 if (seconds > INT_MAX) seconds = INT_MAX; #endif it_new.it_value.tv_sec = seconds; - it_new.it_value.tv_usec = 0; - it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + it_new.it_value.tv_nsec = 0; + it_new.it_interval.tv_sec = it_new.it_interval.tv_nsec = 0; do_setitimer(ITIMER_REAL, &it_new, &it_old); @@ -292,8 +296,8 @@ static unsigned int alarm_setitimer(unsigned int seconds) * We can't return 0 if we have an alarm pending ... And we'd * better return too much than too little anyway */ - if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) || - it_old.it_value.tv_usec >= 500000) + if ((!it_old.it_value.tv_sec && it_old.it_value.tv_nsec) || + it_old.it_value.tv_nsec >= 500000) it_old.it_value.tv_sec++; return it_old.it_value.tv_sec; @@ -310,15 +314,35 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds) #endif +static int get_itimerval(struct itimerspec64 *o, const struct itimerval __user *i) +{ + struct itimerval v; + + if (copy_from_user(&v, i, sizeof(struct itimerval))) + return -EFAULT; + + /* Validate the timevals in value. */ + if (!timeval_valid(&v.it_value) || + !timeval_valid(&v.it_interval)) + return -EINVAL; + + o->it_interval.tv_sec = v.it_interval.tv_sec; + o->it_interval.tv_nsec = v.it_interval.tv_usec * NSEC_PER_USEC; + o->it_value.tv_sec = v.it_value.tv_sec; + o->it_value.tv_nsec = v.it_value.tv_usec * NSEC_PER_USEC; + return 0; +} + SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, struct itimerval __user *, ovalue) { - struct itimerval set_buffer, get_buffer; + struct itimerspec64 set_buffer, get_buffer; int error; if (value) { - if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) - return -EFAULT; + error = get_itimerval(&set_buffer, value); + if (error) + return error; } else { memset(&set_buffer, 0, sizeof(set_buffer)); printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." 
@@ -330,43 +354,53 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, if (error || !ovalue) return error; - if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) + if (put_itimerval(ovalue, &get_buffer)) return -EFAULT; return 0; } #if defined(CONFIG_COMPAT) || defined(CONFIG_ALPHA) -static int get_old_itimerval32(struct itimerval *o, const struct old_itimerval32 __user *i) +static int get_old_itimerval32(struct itimerspec64 *o, const struct old_itimerval32 __user *i) { struct old_itimerval32 v32; if (copy_from_user(&v32, i, sizeof(struct old_itimerval32))) return -EFAULT; + + /* Validate the timevals in value. */ + if (!timeval_valid(&v32.it_value) || + !timeval_valid(&v32.it_interval)) + return -EINVAL; + o->it_interval.tv_sec = v32.it_interval.tv_sec; - o->it_interval.tv_usec = v32.it_interval.tv_usec; + o->it_interval.tv_nsec = v32.it_interval.tv_usec * NSEC_PER_USEC; o->it_value.tv_sec = v32.it_value.tv_sec; - o->it_value.tv_usec = v32.it_value.tv_usec; + o->it_value.tv_nsec = v32.it_value.tv_usec * NSEC_PER_USEC; return 0; } COMPAT_SYSCALL_DEFINE3(setitimer, int, which, - struct old_itimerval32 __user *, in, - struct old_itimerval32 __user *, out) + struct old_itimerval32 __user *, value, + struct old_itimerval32 __user *, ovalue) { - struct itimerval kin, kout; + struct itimerspec64 set_buffer, get_buffer; int error; - if (in) { - if (get_old_itimerval32(&kin, in)) - return -EFAULT; + if (value) { + error = get_old_itimerval32(&set_buffer, value); + if (error) + return error; } else { - memset(&kin, 0, sizeof(kin)); + memset(&set_buffer, 0, sizeof(set_buffer)); + printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer." + " Misfeature support will be removed\n", + current->comm); } - error = do_setitimer(which, &kin, out ? &kout : NULL); - if (error || !out) + error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL); + if (error || !ovalue) return error; - if (put_old_itimerval32(out, &kout)) + if (put_old_itimerval32(ovalue, &get_buffer)) return -EFAULT; return 0; } -- cgit v1.2.3 From 942437c97fd9ff23a17c13118f50bd0490f6868c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 15 Jul 2019 11:46:10 +0200 Subject: y2038: allow disabling time32 system calls At the moment, the compilation of the old time32 system calls depends purely on the architecture. As systems with new libc based on 64-bit time_t are getting deployed, even architectures that previously supported these (notably x86-32 and arm32 but also many others) no longer depend on them, and removing them from a kernel image results in a smaller kernel binary, the same way we can leave out many other optional system calls. More importantly, on an embedded system that needs to keep working beyond year 2038, any user space program calling these system calls is likely a bug, so removing them from the kernel image does provide an extra debugging help for finding broken applications. I've gone back and forth on hiding this option unless CONFIG_EXPERT is set. This version leaves it visible based on the logic that eventually it will be turned off indefinitely. 
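Each COND_SYSCALL() above provides a weak stub so that a kernel configured without the time32 calls still links, and callers get -ENOSYS; roughly (simplified from the generic macro in include/linux/syscalls.h, which some architectures override):

	/* simplified expansion of COND_SYSCALL(nanosleep_time32) */
	asmlinkage long __weak sys_nanosleep_time32(void)
	{
		return sys_ni_syscall();	/* -ENOSYS */
	}

User space probing for support can then fall back to the 64-bit time variants when it sees -ENOSYS.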
Acked-by: Christian Brauner Signed-off-by: Arnd Bergmann --- kernel/sys_ni.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 34b76895b81e..3b69a560a7ac 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -410,6 +410,29 @@ COND_SYSCALL(send); COND_SYSCALL(bdflush); COND_SYSCALL(uselib); +/* optional: time32 */ +COND_SYSCALL(time32); +COND_SYSCALL(stime32); +COND_SYSCALL(utime32); +COND_SYSCALL(adjtimex_time32); +COND_SYSCALL(sched_rr_get_interval_time32); +COND_SYSCALL(nanosleep_time32); +COND_SYSCALL(rt_sigtimedwait_time32); +COND_SYSCALL_COMPAT(rt_sigtimedwait_time32); +COND_SYSCALL(timer_settime32); +COND_SYSCALL(timer_gettime32); +COND_SYSCALL(clock_settime32); +COND_SYSCALL(clock_gettime32); +COND_SYSCALL(clock_getres_time32); +COND_SYSCALL(clock_nanosleep_time32); +COND_SYSCALL(utimes_time32); +COND_SYSCALL(futimesat_time32); +COND_SYSCALL(pselect6_time32); +COND_SYSCALL_COMPAT(pselect6_time32); +COND_SYSCALL(ppoll_time32); +COND_SYSCALL_COMPAT(ppoll_time32); +COND_SYSCALL(utimensat_time32); +COND_SYSCALL(clock_adjtime32); /* * The syscalls below are not found in include/uapi/asm-generic/unistd.h -- cgit v1.2.3 From 58a74a2925a5b4125dd4f4e728490b9642534c81 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 15 Nov 2019 11:17:30 +0200 Subject: tracing: Increase SYNTH_FIELDS_MAX for synthetic_events Increase the maximum allowed count of synthetic event fields from 16 to 32 in order to allow for larger-than-usual events. Link: http://lkml.kernel.org/r/20191115091730.9192-1-dedekind1@gmail.com Reviewed-by: Tom Zanussi Signed-off-by: Artem Bityutskiy Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7482a1466ebf..f49d1a36d3ae 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -23,7 +23,7 @@ #include "trace_dynevent.h" #define SYNTH_SYSTEM "synthetic" -#define SYNTH_FIELDS_MAX 16 +#define SYNTH_FIELDS_MAX 32 #define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ -- cgit v1.2.3 From 1c7f9b673dc0a15753274c4e7f5ebfd4468fc69f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:13:20 -0500 Subject: ftrace: Fix accounting bug with direct->count in register_ftrace_direct() The direct->count wasn't being updated properly: it was only updated when the first entry was added, but it should be updated every time.
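The effect is easiest to see with two call sites sharing one trampoline; ip1/ip2 and my_tramp are illustrative names, not from the patch:

	register_ftrace_direct(ip1, (unsigned long)my_tramp);	/* count = 1 */
	register_ftrace_direct(ip2, (unsigned long)my_tramp);	/* was stuck at 1,
								 * now counts to 2 */
	unregister_ftrace_direct(ip1, (unsigned long)my_tramp);
	/* with the old accounting this could drop the my_tramp
	 * bookkeeping to zero while ip2 still uses it */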
Fixes: 013bf0da04748 ("ftrace: Add ftrace_find_direct_func()") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 834f3556ea1e..32e4e5ffdd97 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5093,8 +5093,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) ftrace_direct_func_count--; } } else { - if (!direct->count) - direct->count++; + direct->count++; } out_unlock: mutex_unlock(&direct_mutex); -- cgit v1.2.3 From 406acdd32d3e7d5a6dcb7f67798e89068fbe0d77 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:19:04 -0500 Subject: ftrace: Add another check for match in register_ftrace_direct() As an instruction pointer passed into register_ftrace_direct() may land on the ftrace call site without being the start of the call site itself, register_ftrace_direct() still needs to test whether a direct call exists on the normalized site, as only one direct call is allowed at any one time. Fixes: 763e34e74bb7d ("ftrace: Add register_ftrace_direct()") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 32e4e5ffdd97..9fe33ebaf914 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5030,7 +5030,12 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) goto out_unlock; /* Make sure the ip points to the exact record */ - ip = rec->ip; + if (ip != rec->ip) { + ip = rec->ip; + /* Need to check this ip for a direct. */ + if (find_rec_direct(ip)) + goto out_unlock; + } ret = -ENOMEM; if (ftrace_hash_empty(direct_functions) || -- cgit v1.2.3 From 128161f47bc3797b0d068da13e311770685d6e4f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Nov 2019 14:14:45 -0500 Subject: ftrace: Add helper find_direct_entry() to consolidate code Both unregister_ftrace_direct() and modify_ftrace_direct() need to normalize the ip passed in to match the rec->ip, as it is acceptable for the ip to be on the ftrace call site but not at its start. There are also common validity checks on the record found by the ip; these should be done for both unregister_ftrace_direct() and modify_ftrace_direct().
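For reference, a minimal direct-call user pairs the two entry points roughly as in the kernel's ftrace-direct sample module (my_tramp is an assembly trampoline, assumed here):

	static int __init ftrace_direct_init(void)
	{
		return register_ftrace_direct((unsigned long)wake_up_process,
					      (unsigned long)my_tramp);
	}

	static void __exit ftrace_direct_exit(void)
	{
		unregister_ftrace_direct((unsigned long)wake_up_process,
					 (unsigned long)my_tramp);
	}

Either call may pass an ip anywhere on the call site; find_direct_entry() below normalizes it to rec->ip and performs the shared sanity checks.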
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 59 +++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9fe33ebaf914..ef79c8393f53 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5112,30 +5112,40 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } EXPORT_SYMBOL_GPL(register_ftrace_direct); -int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) { struct ftrace_func_entry *entry; - struct ftrace_direct_func *direct; struct dyn_ftrace *rec; - int ret = -ENODEV; - mutex_lock(&direct_mutex); + rec = lookup_rec(*ip, *ip); + if (!rec) + return NULL; - entry = __ftrace_lookup_ip(direct_functions, ip); + entry = __ftrace_lookup_ip(direct_functions, rec->ip); if (!entry) { - /* OK if it is off by a little */ - rec = lookup_rec(ip, ip); - if (!rec || rec->ip == ip) - goto out_unlock; + WARN_ON(rec->flags & FTRACE_FL_DIRECT); + return NULL; + } - entry = __ftrace_lookup_ip(direct_functions, rec->ip); - if (!entry) { - WARN_ON(rec->flags & FTRACE_FL_DIRECT); - goto out_unlock; - } + WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - } + /* Passed in ip just needs to be on the call site */ + *ip = rec->ip; + + return entry; +} + +int unregister_ftrace_direct(unsigned long ip, unsigned long addr) +{ + struct ftrace_direct_func *direct; + struct ftrace_func_entry *entry; + int ret = -ENODEV; + + mutex_lock(&direct_mutex); + + entry = find_direct_entry(&ip); + if (!entry) + goto out_unlock; if (direct_functions->count == 1) unregister_ftrace_function(&direct_ops); @@ -5187,24 +5197,13 @@ int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr) { struct ftrace_func_entry *entry; - struct dyn_ftrace *rec; int ret = -ENODEV; mutex_lock(&direct_mutex); - entry = __ftrace_lookup_ip(direct_functions, ip); - if (!entry) { - /* OK if it is off by a little */ - rec = lookup_rec(ip, ip); - if (!rec || rec->ip == ip) - goto out_unlock; - - entry = __ftrace_lookup_ip(direct_functions, rec->ip); - if (!entry) - goto out_unlock; - ip = rec->ip; - WARN_ON(!(rec->flags & FTRACE_FL_DIRECT)); - } + entry = find_direct_entry(&ip); + if (!entry) + goto out_unlock; ret = -EINVAL; if (entry->direct != old_addr) -- cgit v1.2.3 From 4a169a95d885fe5c050bac1a21d43c86ba955bcf Mon Sep 17 00:00:00 2001 From: Maulik Shah Date: Fri, 15 Nov 2019 15:11:49 -0700 Subject: genirq: Introduce irq_chip_get/set_parent_state calls On certain QTI chipsets some GPIOs are direct-connect interrupts to the GIC to be used as regular interrupt lines. When the GPIOs are not used for interrupt generation the interrupt line is disabled. But disabling the interrupt at the GIC does not prevent it from being reported as pending at GIC_ISPEND. Later, when drivers call enable_irq() on the interrupt, an unwanted interrupt occurs. Introduce get and set methods for the irqchip's parent to clear its pending irq state. The GPIO interrupt controller can then invoke these on the parents in its hierarchy to clear the interrupt before enabling it.
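A hypothetical hierarchical GPIO driver would then clear the stale parent state in its enable path; the driver and function names below are illustrative, while the two helpers are the ones this patch adds:

	static void my_gpio_irq_enable(struct irq_data *d)
	{
		/* discard a pending bit latched at the parent (GIC)
		 * while the line was disabled */
		irq_chip_set_parent_state(d, IRQCHIP_STATE_PENDING, false);
		irq_chip_enable_parent(d);
	}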
Signed-off-by: Maulik Shah Signed-off-by: Lina Iyer Signed-off-by: Marc Zyngier Reviewed-by: Stephen Boyd Link: https://lore.kernel.org/r/1573855915-9841-7-git-send-email-ilina@codeaurora.org [updated commit text and minor code fixes] --- kernel/irq/chip.c | 44 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b76703b2c0af..b3fa2d87d2f3 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1297,6 +1297,50 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_mask_irq); #endif /* CONFIG_IRQ_FASTEOI_HIERARCHY_HANDLERS */ +/** + * irq_chip_set_parent_state - set the state of a parent interrupt. + * + * @data: Pointer to interrupt specific data + * @which: State to be restored (one of IRQCHIP_STATE_*) + * @val: Value corresponding to @which + * + * Conditional success, if the underlying irqchip does not implement it. + */ +int irq_chip_set_parent_state(struct irq_data *data, + enum irqchip_irq_state which, + bool val) +{ + data = data->parent_data; + + if (!data || !data->chip->irq_set_irqchip_state) + return 0; + + return data->chip->irq_set_irqchip_state(data, which, val); +} +EXPORT_SYMBOL_GPL(irq_chip_set_parent_state); + +/** + * irq_chip_get_parent_state - get the state of a parent interrupt. + * + * @data: Pointer to interrupt specific data + * @which: one of IRQCHIP_STATE_* the caller wants to know + * @state: a pointer to a boolean where the state is to be stored + * + * Conditional success, if the underlying irqchip does not implement it. + */ +int irq_chip_get_parent_state(struct irq_data *data, + enum irqchip_irq_state which, + bool *state) +{ + data = data->parent_data; + + if (!data || !data->chip->irq_get_irqchip_state) + return 0; + + return data->chip->irq_get_irqchip_state(data, which, state); +} +EXPORT_SYMBOL_GPL(irq_chip_get_parent_state); + /** * irq_chip_enable_parent - Enable the parent interrupt (defaults to unmask if * NULL) -- cgit v1.2.3 From ea806eb3eab35528b578a061b2c4b28f0f92c465 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 17 Nov 2019 17:04:15 -0500 Subject: ftrace: Add a helper function to modify_ftrace_direct() to allow arch optimization If a direct ftrace callback is at a location that does not have any other ftrace helpers attached to it, it is possible to simply change the text to call the new caller (if the architecture supports it). But this requires special architecture code. Currently, modify_ftrace_direct() uses a trick: it adds a stub ftrace callback to the location, forcing it to call the ftrace iterator. Then it can change the direct helper to call the new function in C, and then remove the stub. Removing the stub makes the location call the new address that the direct helper is using. The new helper function implements this register-the-stub trick, but is a weak function, allowing an architecture to override it and do something a bit more direct.
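Per the kerneldoc in the hunk below, an architecture override could skip the stub dance entirely and patch the site; a sketch, where arch_patch_call_site() is a made-up primitive standing in for real arch text-poking code:

	int ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
					struct dyn_ftrace *rec,
					unsigned long old_addr,
					unsigned long new_addr)
	{
		int ret = arch_patch_call_site(rec->ip, old_addr, new_addr);

		if (!ret)
			entry->direct = new_addr;	/* required by the contract */
		return ret;
	}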
Link: https://lore.kernel.org/r/20191115215125.mbqv7taqnx376yed@ast-mbp.dhcp.thefacebook.com Suggested-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 120 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 88 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ef79c8393f53..caae523f4ef3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1020,12 +1020,6 @@ static bool update_all_ops; # error Dynamic ftrace depends on MCOUNT_RECORD #endif -struct ftrace_func_entry { - struct hlist_node hlist; - unsigned long ip; - unsigned long direct; /* for direct lookup only */ -}; - struct ftrace_func_probe { struct ftrace_probe_ops *probe_ops; struct ftrace_ops ops; @@ -5112,7 +5106,8 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } EXPORT_SYMBOL_GPL(register_ftrace_direct); -static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) +static struct ftrace_func_entry *find_direct_entry(unsigned long *ip, + struct dyn_ftrace **recp) { struct ftrace_func_entry *entry; struct dyn_ftrace *rec; @@ -5132,6 +5127,9 @@ static struct ftrace_func_entry *find_direct_entry(unsigned long *ip) /* Passed in ip just needs to be on the call site */ *ip = rec->ip; + if (recp) + *recp = rec; + return entry; } @@ -5143,7 +5141,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) mutex_lock(&direct_mutex); - entry = find_direct_entry(&ip); + entry = find_direct_entry(&ip, NULL); if (!entry) goto out_unlock; @@ -5179,6 +5177,75 @@ static struct ftrace_ops stub_ops = { .func = ftrace_stub, }; +/** + * ftrace_modify_direct_caller - modify ftrace nop directly + * @entry: The ftrace hash entry of the direct helper for @rec + * @rec: The record representing the function site to patch + * @old_addr: The location that the site at @rec->ip currently calls + * @new_addr: The location that the site at @rec->ip should call + * + * An architecture may overwrite this function to optimize the + * changing of the direct callback on an ftrace nop location. + * This is called with the ftrace_lock mutex held, and no other + * ftrace callbacks are on the associated record (@rec). Thus, + * it is safe to modify the ftrace record, where it should be + * currently calling @old_addr directly, to call @new_addr. + * + * Safety checks should be made to make sure that the code at + * @rec->ip is currently calling @old_addr. And this must + * also update entry->direct to @new_addr. + */ +int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, + struct dyn_ftrace *rec, + unsigned long old_addr, + unsigned long new_addr) +{ + unsigned long ip = rec->ip; + int ret; + + /* + * The ftrace_lock was used to determine if the record + * had more than one registered user to it. If it did, + * we needed to prevent that from changing to do the quick + * switch. But if it did not (only a direct caller was attached) + * then this function is called. But this function can deal + * with attached callers to the rec that we care about, and + * since this function uses standard ftrace calls that take + * the ftrace_lock mutex, we need to release it. + */ + mutex_unlock(&ftrace_lock); + + /* + * By setting a stub function at the same address, we force + * the code to call the iterator and the direct_ops helper. + * This means that @ip does not call the direct call, and + * we can simply modify it. 
+ */ + ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); + if (ret) + goto out_lock; + + ret = register_ftrace_function(&stub_ops); + if (ret) { + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + goto out_lock; + } + + entry->direct = new_addr; + + /* + * By removing the stub, we put back the direct call, calling + * the @new_addr. + */ + unregister_ftrace_function(&stub_ops); + ftrace_set_filter_ip(&stub_ops, ip, 1, 0); + + out_lock: + mutex_lock(&ftrace_lock); + + return ret; +} + /** * modify_ftrace_direct - Modify an existing direct call to call something else * @ip: The instruction pointer to modify @@ -5197,11 +5264,13 @@ int modify_ftrace_direct(unsigned long ip, unsigned long old_addr, unsigned long new_addr) { struct ftrace_func_entry *entry; + struct dyn_ftrace *rec; int ret = -ENODEV; mutex_lock(&direct_mutex); - entry = find_direct_entry(&ip); + mutex_lock(&ftrace_lock); + entry = find_direct_entry(&ip, &rec); if (!entry) goto out_unlock; @@ -5210,33 +5279,20 @@ int modify_ftrace_direct(unsigned long ip, goto out_unlock; /* - * By setting a stub function at the same address, we force - * the code to call the iterator and the direct_ops helper. - * This means that @ip does not call the direct call, and - * we can simply modify it. + * If there's no other ftrace callback on the rec->ip location, + * then it can be changed directly by the architecture. + * If there is another caller, then we just need to change the + * direct caller helper to point to @new_addr. */ - ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0); - if (ret) - goto out_unlock; - - ret = register_ftrace_function(&stub_ops); - if (ret) { - ftrace_set_filter_ip(&stub_ops, ip, 1, 0); - goto out_unlock; + if (ftrace_rec_count(rec) == 1) { + ret = ftrace_modify_direct_caller(entry, rec, old_addr, new_addr); + } else { + entry->direct = new_addr; + ret = 0; } - entry->direct = new_addr; - - /* - * By removing the stub, we put back the direct call, calling - * the @new_addr. - */ - unregister_ftrace_function(&stub_ops); - ftrace_set_filter_ip(&stub_ops, ip, 1, 0); - - ret = 0; - out_unlock: + mutex_unlock(&ftrace_lock); mutex_unlock(&direct_mutex); return ret; } -- cgit v1.2.3 From 46f9469247c6f4697cbbf37e4b3961120bf07f29 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 18 Nov 2019 10:41:29 -0500 Subject: ftrace: Rename ftrace_graph_stub to ftrace_stub_graph The ftrace_graph_stub was created and points to ftrace_stub as a way to assign the function graph tracer function pointer to a stub function with a different prototype than what ftrace_stub has, without triggering the C type verifier. The ftrace_graph_stub was created via the linker script vmlinux.lds.h. Unfortunately, powerpc already uses the name ftrace_graph_stub for its internal implementation of the function graph tracer, and even though powerpc would still build, the change via the linker script broke the function graph tracer on powerpc. By using the name ftrace_stub_graph, which does not exist anywhere else in the kernel, this should not be a problem.
Link: https://lore.kernel.org/r/1573849732.5937.136.camel@lca.pw Fixes: b83b43ffc6e4 ("fgraph: Fix function type mismatches of ftrace_graph_return using ftrace_stub") Reported-by: Qian Cai Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/fgraph.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index fa3ce10d0405..67e0c462b059 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -336,10 +336,10 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) * Simply points to ftrace_stub, but with the proper protocol. * Defined by the linker script in linux/vmlinux.lds.h */ -extern void ftrace_graph_stub(struct ftrace_graph_ret *); +extern void ftrace_stub_graph(struct ftrace_graph_ret *); /* The callbacks that hook a function */ -trace_func_graph_ret_t ftrace_graph_return = ftrace_graph_stub; +trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph; trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; @@ -619,7 +619,7 @@ void unregister_ftrace_graph(struct fgraph_ops *gops) goto out; ftrace_graph_active--; - ftrace_graph_return = ftrace_graph_stub; + ftrace_graph_return = ftrace_stub_graph; ftrace_graph_entry = ftrace_graph_entry_stub; __ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET); -- cgit v1.2.3 From c55b51a06b01d67a99457bb82a8c31081c7faa23 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 16 Nov 2019 14:16:12 +0100 Subject: cpuidle: Allow idle injection to apply exit latency limit In some cases it may be useful to specify an exit latency limit for the idle state to be used during CPU idle time injection. Instead of duplicating the information in struct cpuidle_device or propagating the latency limit in the call stack, replace the use_deepest_state field with forced_idle_latency_limit_ns to represent that limit, so that the deepest idle state with exit latency within that limit is forced (i.e. no governors) when it is set. A zero exit latency limit for forced idle means to use governors in the usual way (analogous to use_deepest_state equal to "false" before this change). Additionally, add play_idle_precise() taking two arguments, the duration of forced idle and the idle state exit latency limit, both in nanoseconds, and redefine play_idle() as a wrapper around that new function. This change is preparatory; no functional impact is expected. Suggested-by: Rafael J. Wysocki Signed-off-by: Daniel Lezcano [ rjw: Subject, changelog, cpuidle_use_deepest_state() kerneldoc, whitespace ] Signed-off-by: Rafael J. Wysocki --- kernel/sched/idle.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1aa260702b38..cd05ffa0abfe 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -165,7 +165,7 @@ static void cpuidle_idle_call(void) * until a proper wakeup interrupt happens.
*/ - if (idle_should_enter_s2idle() || dev->use_deepest_state) { + if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) { if (idle_should_enter_s2idle()) { rcu_idle_enter(); @@ -311,7 +311,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -void play_idle(unsigned long duration_us) +void play_idle_precise(u64 duration_ns, u64 latency_ns) { struct idle_timer it; @@ -323,29 +323,29 @@ void play_idle(unsigned long duration_us) WARN_ON_ONCE(current->nr_cpus_allowed != 1); WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); - WARN_ON_ONCE(!duration_us); + WARN_ON_ONCE(!duration_ns); rcu_sleep_check(); preempt_disable(); current->flags |= PF_IDLE; - cpuidle_use_deepest_state(true); + cpuidle_use_deepest_state(latency_ns); it.done = 0; hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); it.timer.function = idle_inject_timer_fn; - hrtimer_start(&it.timer, ns_to_ktime(duration_us * NSEC_PER_USEC), + hrtimer_start(&it.timer, ns_to_ktime(duration_ns), HRTIMER_MODE_REL_PINNED); while (!READ_ONCE(it.done)) do_idle(); - cpuidle_use_deepest_state(false); + cpuidle_use_deepest_state(0); current->flags &= ~PF_IDLE; preempt_fold_need_resched(); preempt_enable(); } -EXPORT_SYMBOL_GPL(play_idle); +EXPORT_SYMBOL_GPL(play_idle_precise); void cpu_startup_entry(enum cpuhp_state state) { -- cgit v1.2.3 From 5aa9ba6312e36c18626e73506b92d1513d815435 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 16 Nov 2019 14:16:13 +0100 Subject: cpuidle: Pass exit latency limit to cpuidle_use_deepest_state() Modify cpuidle_use_deepest_state() to take an additional exit latency limit argument to be passed to find_deepest_idle_state() and make cpuidle_idle_call() pass dev->forced_idle_latency_limit_ns to it for forced idle. Suggested-by: Rafael J. Wysocki Signed-off-by: Daniel Lezcano [ rjw: Rebase and rearrange code, subject & changelog ] Signed-off-by: Rafael J. Wysocki --- kernel/sched/idle.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cd05ffa0abfe..fc9604ddd802 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -166,6 +166,8 @@ static void cpuidle_idle_call(void) */ if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) { + u64 max_latency_ns; + if (idle_should_enter_s2idle()) { rcu_idle_enter(); @@ -176,12 +178,16 @@ static void cpuidle_idle_call(void) } rcu_idle_exit(); + + max_latency_ns = U64_MAX; + } else { + max_latency_ns = dev->forced_idle_latency_limit_ns; } tick_nohz_idle_stop_tick(); rcu_idle_enter(); - next_state = cpuidle_find_deepest_state(drv, dev); + next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns); call_cpuidle(drv, dev, next_state); } else { bool stop_tick = true; -- cgit v1.2.3 From 56e35f9c5b87ec1ae93e483284e189c84388de16 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Nov 2019 18:03:11 +0100 Subject: dma-mapping: drop the dev argument to arch_sync_dma_for_* These are pure cache maintenance routines, so drop the unused struct device argument.
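An architecture now implements the hooks without the device argument; a sketch for a write-back cache machine, with cache_clean_range()/cache_inv_range() standing in for real arch primitives:

	void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
				      enum dma_data_direction dir)
	{
		if (dir == DMA_FROM_DEVICE)
			cache_inv_range(paddr, paddr + size);	/* drop stale lines */
		else
			cache_clean_range(paddr, paddr + size);	/* publish dirty data */
	}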
Signed-off-by: Christoph Hellwig Suggested-by: Daniel Vetter --- kernel/dma/direct.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 22a2e0833862..077876ae5c74 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -268,7 +268,7 @@ void dma_direct_sync_single_for_device(struct device *dev, swiotlb_tbl_sync_single(dev, paddr, size, dir, SYNC_FOR_DEVICE); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(dev, paddr, size, dir); + arch_sync_dma_for_device(paddr, size, dir); } EXPORT_SYMBOL(dma_direct_sync_single_for_device); @@ -286,7 +286,7 @@ void dma_direct_sync_sg_for_device(struct device *dev, dir, SYNC_FOR_DEVICE); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_device(dev, paddr, sg->length, + arch_sync_dma_for_device(paddr, sg->length, dir); } } @@ -302,8 +302,8 @@ void dma_direct_sync_single_for_cpu(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, addr); if (!dev_is_dma_coherent(dev)) { - arch_sync_dma_for_cpu(dev, paddr, size, dir); - arch_sync_dma_for_cpu_all(dev); + arch_sync_dma_for_cpu(paddr, size, dir); + arch_sync_dma_for_cpu_all(); } if (unlikely(is_swiotlb_buffer(paddr))) @@ -321,7 +321,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg)); if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_cpu(dev, paddr, sg->length, dir); + arch_sync_dma_for_cpu(paddr, sg->length, dir); if (unlikely(is_swiotlb_buffer(paddr))) swiotlb_tbl_sync_single(dev, paddr, sg->length, dir, @@ -329,7 +329,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev, } if (!dev_is_dma_coherent(dev)) - arch_sync_dma_for_cpu_all(dev); + arch_sync_dma_for_cpu_all(); } EXPORT_SYMBOL(dma_direct_sync_sg_for_cpu); @@ -380,7 +380,7 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, } if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_device(dev, phys, size, dir); + arch_sync_dma_for_device(phys, size, dir); return dma_addr; } EXPORT_SYMBOL(dma_direct_map_page); -- cgit v1.2.3 From 50f579a2399dee0ad1c86ea8159ab8657b74f95b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Nov 2019 09:18:19 +0300 Subject: dma-debug: clean up put_hash_bucket() The put_hash_bucket() is a bit cleaner if it takes an unsigned long directly instead of a pointer to unsigned long. 
Signed-off-by: Dan Carpenter Signed-off-by: Christoph Hellwig --- kernel/dma/debug.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 004496654aaa..64972aa9bfc3 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -255,12 +255,10 @@ static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, * Give up exclusive access to the hash bucket */ static void put_hash_bucket(struct hash_bucket *bucket, - unsigned long *flags) + unsigned long flags) __releases(&bucket->lock) { - unsigned long __flags = *flags; - - spin_unlock_irqrestore(&bucket->lock, __flags); + spin_unlock_irqrestore(&bucket->lock, flags); } static bool exact_match(struct dma_debug_entry *a, struct dma_debug_entry *b) @@ -359,7 +357,7 @@ static struct dma_debug_entry *bucket_find_contain(struct hash_bucket **bucket, /* * Nothing found, go back a hash bucket */ - put_hash_bucket(*bucket, flags); + put_hash_bucket(*bucket, *flags); range += (1 << HASH_FN_SHIFT); index.dev_addr -= (1 << HASH_FN_SHIFT); *bucket = get_hash_bucket(&index, flags); @@ -609,7 +607,7 @@ static void add_dma_entry(struct dma_debug_entry *entry) bucket = get_hash_bucket(entry, &flags); hash_bucket_add(bucket, entry); - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); rc = active_cacheline_insert(entry); if (rc == -ENOMEM) { @@ -1002,7 +1000,7 @@ static void check_unmap(struct dma_debug_entry *ref) if (!entry) { /* must drop lock before calling dma_mapping_error */ - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); if (dma_mapping_error(ref->dev, ref->dev_addr)) { err_printk(ref->dev, NULL, @@ -1084,7 +1082,7 @@ static void check_unmap(struct dma_debug_entry *ref) hash_bucket_del(entry); dma_entry_free(entry); - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); } static void check_for_stack(struct device *dev, @@ -1204,7 +1202,7 @@ static void check_sync(struct device *dev, } out: - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); } static void check_sg_segment(struct device *dev, struct scatterlist *sg) @@ -1319,7 +1317,7 @@ void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) } } - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); } EXPORT_SYMBOL(debug_dma_mapping_error); @@ -1392,7 +1390,7 @@ static int get_nr_mapped_entries(struct device *dev, if (entry) mapped_ents = entry->sg_mapped_ents; - put_hash_bucket(bucket, &flags); + put_hash_bucket(bucket, flags); return mapped_ents; } -- cgit v1.2.3 From 4268ac6ae5870af10a7417b22990d615f72f77e2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2019 17:35:36 +0100 Subject: dma-direct: don't check swiotlb=force in dma_direct_map_resource When mapping resources we can't just use swiotlb ram for bounce buffering. Switch to a direct dma_capable check instead. 
Fixes: cfced786969c ("dma-mapping: remove the default map_resource implementation") Reported-by: Robin Murphy Signed-off-by: Christoph Hellwig Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski --- kernel/dma/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 077876ae5c74..a479bd2d1e8b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -412,7 +412,7 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, { dma_addr_t dma_addr = paddr; - if (unlikely(!dma_direct_possible(dev, dma_addr, size))) { + if (unlikely(!dma_capable(dev, dma_addr, size))) { report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } -- cgit v1.2.3 From 68a33b1794665ba8a1d1ef1d3bfcc7c587d380a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Nov 2019 17:38:58 +0100 Subject: dma-direct: exclude dma_direct_map_resource from the min_low_pfn check The valid memory address check in dma_capable only makes sense when mapping normal memory, not when using dma_map_resource to map a device resource. Add a new boolean argument to dma_capable to exclude that check for the dma_map_resource case. Fixes: b12d66278dd6 ("dma-direct: check for overflows on 32 bit DMA addresses") Reported-by: Marek Szyprowski Signed-off-by: Christoph Hellwig Acked-by: Marek Szyprowski Tested-by: Marek Szyprowski --- kernel/dma/direct.c | 4 ++-- kernel/dma/swiotlb.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a479bd2d1e8b..40f1f0aac4b1 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -363,7 +363,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, size_t size) { return swiotlb_force != SWIOTLB_FORCE && - dma_capable(dev, dma_addr, size); + dma_capable(dev, dma_addr, size, true); } dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, @@ -412,7 +412,7 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, { dma_addr_t dma_addr = paddr; - if (unlikely(!dma_capable(dev, dma_addr, size))) { + if (unlikely(!dma_capable(dev, dma_addr, size, false))) { report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 673a2cdb2656..9280d6f8271e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -678,7 +678,7 @@ bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, /* Ensure that the address returned is DMA'ble */ *dma_addr = __phys_to_dma(dev, *phys); - if (unlikely(!dma_capable(dev, *dma_addr, size))) { + if (unlikely(!dma_capable(dev, *dma_addr, size, true))) { swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); return false; -- cgit v1.2.3 From 7b8474466ed97be458c825f34a85f2c2b84c3f95 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Thu, 21 Nov 2019 00:03:03 +0000 Subject: time: Zero the upper 32-bits in __kernel_timespec on 32-bit On compat interfaces, the high order bits of nanoseconds should be zeroed out. This is because the application code or the libc do not guarantee zeroing of these. If used without zeroing, kernel might be at risk of using timespec values incorrectly. Originally it was handled correctly, but lost during is_compat_syscall() cleanup. Revert the condition back to check CONFIG_64BIT. 
Fixes: 98f76206b335 ("compat: Cleanup in_compat_syscall() callers") Reported-by: Ben Hutchings Signed-off-by: Dmitry Safonov Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20191121000303.126523-1-dima@arista.com --- kernel/time/time.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/time.c b/kernel/time/time.c index 5c54ca632d08..83f403e7a15c 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -881,7 +881,8 @@ int get_timespec64(struct timespec64 *ts, ts->tv_sec = kts.tv_sec; /* Zero out the padding for 32 bit systems or in compat mode */ - if (IS_ENABLED(CONFIG_64BIT_TIME) && in_compat_syscall()) + if (IS_ENABLED(CONFIG_64BIT_TIME) && (!IS_ENABLED(CONFIG_64BIT) || + in_compat_syscall())) kts.tv_nsec &= 0xFFFFFFFFUL; ts->tv_nsec = kts.tv_nsec; -- cgit v1.2.3 From a7ba70f1787f977f970cd116076c6fce4b9e01cc Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Thu, 21 Nov 2019 10:26:44 +0100 Subject: dma-mapping: treat dev->bus_dma_mask as a DMA limit Using a mask to represent bus DMA constraints has a set of limitations. The biggest one being it can only hold a power of two (minus one). The DMA mapping code is already aware of this and treats dev->bus_dma_mask as a limit. This quirk is already used by some architectures although still rare. With the introduction of the Raspberry Pi 4 we've found a new contender for the use of bus DMA limits, as its PCIe bus can only address the lower 3GB of memory (of a total of 4GB). This is impossible to represent with a mask. To make things worse the device-tree code rounds non power of two bus DMA limits to the next power of two, which is unacceptable in this case. In the light of this, rename dev->bus_dma_mask to dev->bus_dma_limit all over the tree and treat it as such. Note that dev->bus_dma_limit should contain the higher accessible DMA address. 
Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 267b23a13b69..6af7ae83c4ad 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -27,10 +27,10 @@ static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) { if (!dev->dma_mask) { dev_err_once(dev, "DMA map on device without dma_mask\n"); - } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { + } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) { dev_err_once(dev, - "overflow %pad+%zu of DMA mask %llx bus mask %llx\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_mask); + "overflow %pad+%zu of DMA mask %llx bus limit %llx\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); } WARN_ON_ONCE(1); } @@ -57,15 +57,14 @@ u64 dma_direct_get_required_mask(struct device *dev) } static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, - u64 *phys_mask) + u64 *phys_limit) { - if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask) - dma_mask = dev->bus_dma_mask; + u64 dma_limit = min_not_zero(dma_mask, dev->bus_dma_limit); if (force_dma_unencrypted(dev)) - *phys_mask = __dma_to_phys(dev, dma_mask); + *phys_limit = __dma_to_phys(dev, dma_limit); else - *phys_mask = dma_to_phys(dev, dma_mask); + *phys_limit = dma_to_phys(dev, dma_limit); /* * Optimistically try the zone that the physical address mask falls @@ -75,9 +74,9 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, * Note that GFP_DMA32 and GFP_DMA are no ops without the corresponding * zones. */ - if (*phys_mask <= DMA_BIT_MASK(zone_dma_bits)) + if (*phys_limit <= DMA_BIT_MASK(zone_dma_bits)) return GFP_DMA; - if (*phys_mask <= DMA_BIT_MASK(32)) + if (*phys_limit <= DMA_BIT_MASK(32)) return GFP_DMA32; return 0; } @@ -85,7 +84,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= - min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask); + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, @@ -94,7 +93,7 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, size_t alloc_size = PAGE_ALIGN(size); int node = dev_to_node(dev); struct page *page = NULL; - u64 phys_mask; + u64 phys_limit; if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; @@ -102,7 +101,7 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, - &phys_mask); + &phys_limit); page = dma_alloc_contiguous(dev, alloc_size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, alloc_size); @@ -116,7 +115,7 @@ again: page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && - phys_mask < DMA_BIT_MASK(64) && + phys_limit < DMA_BIT_MASK(64) && !(gfp & (GFP_DMA32 | GFP_DMA))) { gfp |= GFP_DMA32; goto again; -- cgit v1.2.3 From 0e4a459f56c32d3e52ae69a4b447db2f48a65f44 Mon Sep 17 00:00:00 2001 From: Kusanagi Kouichi Date: Wed, 20 Nov 2019 19:43:50 +0900 Subject: tracing: Remove unnecessary DEBUG_FS dependency Tracing replaced debugfs with 
tracefs. Signed-off-by: Kusanagi Kouichi Reviewed-by: Steven Rostedt (VMware) Link: https://lore.kernel.org/r/20191120104350753.EWCT.12796.ppp.dion.ne.jp@dmta0009.auone-net.jp Signed-off-by: Greg Kroah-Hartman --- kernel/trace/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e08527f50d2a..382628b9b759 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -106,7 +106,6 @@ config PREEMPTIRQ_TRACEPOINTS config TRACING bool - select DEBUG_FS select RING_BUFFER select STACKTRACE if STACKTRACE_SUPPORT select TRACEPOINTS -- cgit v1.2.3 From a82a4804b4ee3636c8988fea14d44f70f4de45f1 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Sat, 16 Nov 2019 10:05:55 -0500 Subject: ring-buffer: Fix typos in function ring_buffer_producer Fix spelling and other typos Link: http://lkml.kernel.org/r/1573916755-32478-1-git-send-email-xianting_tian@126.com Signed-off-by: Xianting Tian Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer_benchmark.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 09b0b49f346e..32149e46551c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -269,10 +269,10 @@ static void ring_buffer_producer(void) #ifndef CONFIG_PREEMPTION /* - * If we are a non preempt kernel, the 10 second run will + * If we are a non preempt kernel, the 10 seconds run will * stop everything while it runs. Instead, we will call * cond_resched and also add any time that was lost by a - * rescedule. + * reschedule. * * Do a cond resched at the same frequency we would wake up * the reader. -- cgit v1.2.3 From fc809bc5ceaa665497384064ab2d76713c774bad Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 20 Nov 2019 21:38:07 +0800 Subject: tracing: Fix Kconfig indentation Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Link: http://lkml.kernel.org/r/20191120133807.12741-1-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b872716bb2a0..f67620499faa 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -79,7 +79,7 @@ config FTRACE_NMI_ENTER config EVENT_TRACING select CONTEXT_SWITCH_TRACER - select GLOB + select GLOB bool config CONTEXT_SWITCH_TRACER @@ -311,7 +311,7 @@ config TRACER_SNAPSHOT cat snapshot config TRACER_SNAPSHOT_PER_CPU_SWAP - bool "Allow snapshot to swap per CPU" + bool "Allow snapshot to swap per CPU" depends on TRACER_SNAPSHOT select RING_BUFFER_ALLOW_SWAP help @@ -683,7 +683,7 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. config TRACEPOINT_BENCHMARK - bool "Add tracepoint that benchmarks tracepoints" + bool "Add tracepoint that benchmarks tracepoints" help This option creates the tracepoint "benchmark:benchmark_event". When the tracepoint is enabled, it kicks off a kernel thread that @@ -732,7 +732,7 @@ config RING_BUFFER_STARTUP_TEST bool "Ring buffer startup self test" depends on RING_BUFFER help - Run a simple self test on the ring buffer on boot up. Late in the + Run a simple self test on the ring buffer on boot up. 
Late in the kernel boot sequence, the test will start that kicks off a thread per cpu. Each thread will write various size events into the ring buffer. Another thread is created to send IPIs -- cgit v1.2.3 From 28879787147358e8ffcae397f11748de3dd26577 Mon Sep 17 00:00:00 2001 From: Divya Indi Date: Wed, 20 Nov 2019 11:08:38 -0800 Subject: tracing: Adding new functions for kernel access to Ftrace instances Adding 2 new functions - 1) struct trace_array *trace_array_get_by_name(const char *name); Return pointer to a trace array with given name. If it does not exist, create and return pointer to the new trace array. 2) int trace_array_set_clr_event(struct trace_array *tr, const char *system ,const char *event, bool enable); Enable/Disable events to this trace array. Additionally, - To handle reference counters, export trace_array_put() - Due to introduction of the above 2 new functions, we no longer need to export - ftrace_set_clr_event & trace_array_create APIs. Link: http://lkml.kernel.org/r/1574276919-11119-2-git-send-email-divya.indi@oracle.com Signed-off-by: Divya Indi Reviewed-by: Aruna Ramakrishna Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 96 +++++++++++++++++++++++++++++++++++---------- kernel/trace/trace.h | 1 - kernel/trace/trace_events.c | 27 ++++++++++++- 3 files changed, 102 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42659ce6ac0c..02a23a6e5e00 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -301,12 +301,24 @@ static void __trace_array_put(struct trace_array *this_tr) this_tr->ref--; } +/** + * trace_array_put - Decrement the reference counter for this trace array. + * + * NOTE: Use this when we no longer need the trace array returned by + * trace_array_get_by_name(). This ensures the trace array can be later + * destroyed. 
+ * + */ void trace_array_put(struct trace_array *this_tr) { + if (!this_tr) + return; + mutex_lock(&trace_types_lock); __trace_array_put(this_tr); mutex_unlock(&trace_types_lock); } +EXPORT_SYMBOL_GPL(trace_array_put); int tracing_check_open_get_tr(struct trace_array *tr) { @@ -8437,24 +8449,15 @@ static void update_tracer_options(struct trace_array *tr) mutex_unlock(&trace_types_lock); } -struct trace_array *trace_array_create(const char *name) +static struct trace_array *trace_array_create(const char *name) { struct trace_array *tr; int ret; - mutex_lock(&event_mutex); - mutex_lock(&trace_types_lock); - - ret = -EEXIST; - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (tr->name && strcmp(tr->name, name) == 0) - goto out_unlock; - } - ret = -ENOMEM; tr = kzalloc(sizeof(*tr), GFP_KERNEL); if (!tr) - goto out_unlock; + return ERR_PTR(ret); tr->name = kstrdup(name, GFP_KERNEL); if (!tr->name) @@ -8499,8 +8502,8 @@ struct trace_array *trace_array_create(const char *name) list_add(&tr->list, &ftrace_trace_arrays); - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); + tr->ref++; + return tr; @@ -8510,24 +8513,77 @@ struct trace_array *trace_array_create(const char *name) kfree(tr->name); kfree(tr); - out_unlock: - mutex_unlock(&trace_types_lock); - mutex_unlock(&event_mutex); - return ERR_PTR(ret); } -EXPORT_SYMBOL_GPL(trace_array_create); static int instance_mkdir(const char *name) { - return PTR_ERR_OR_ZERO(trace_array_create(name)); + struct trace_array *tr; + int ret; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + ret = -EEXIST; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + tr = trace_array_create(name); + + ret = PTR_ERR_OR_ZERO(tr); + +out_unlock: + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + return ret; +} + +/** + * trace_array_get_by_name - Create/Lookup a trace array, given its name. + * @name: The name of the trace array to be looked up/created. + * + * Returns pointer to trace array with given name. + * NULL, if it cannot be created. + * + * NOTE: This function increments the reference counter associated with the + * trace array returned. This makes sure it cannot be freed while in use. + * Use trace_array_put() once the trace array is no longer needed. + * + */ +struct trace_array *trace_array_get_by_name(const char *name) +{ + struct trace_array *tr; + + mutex_lock(&event_mutex); + mutex_lock(&trace_types_lock); + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + tr = trace_array_create(name); + + if (IS_ERR(tr)) + tr = NULL; +out_unlock: + if (tr) + tr->ref++; + + mutex_unlock(&trace_types_lock); + mutex_unlock(&event_mutex); + return tr; } +EXPORT_SYMBOL_GPL(trace_array_get_by_name); static int __remove_instance(struct trace_array *tr) { int i; - if (tr->ref || (tr->current_trace && tr->current_trace->ref)) + /* Reference counter for a newly created trace array = 1. 
*/ + if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref)) return -EBUSY; list_del(&tr->list); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2df8aed6a8f0..ca7fccafbcbb 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -345,7 +345,6 @@ extern struct list_head ftrace_trace_arrays; extern struct mutex trace_types_lock; extern int trace_array_get(struct trace_array *tr); -extern void trace_array_put(struct trace_array *tr); extern int tracing_check_open_get_tr(struct trace_array *tr); extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2a3ac2365445..6b3a69e9aa6a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -827,7 +827,6 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) return ret; } -EXPORT_SYMBOL_GPL(ftrace_set_clr_event); /** * trace_set_clr_event - enable or disable an event @@ -852,6 +851,32 @@ int trace_set_clr_event(const char *system, const char *event, int set) } EXPORT_SYMBOL_GPL(trace_set_clr_event); +/** + * trace_array_set_clr_event - enable or disable an event for a trace array. + * @tr: concerned trace array. + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @enable: true to enable, false to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. + */ +int trace_array_set_clr_event(struct trace_array *tr, const char *system, + const char *event, bool enable) +{ + int set; + + if (!tr) + return -ENOENT; + + set = (enable == true) ? 1 : 0; + return __ftrace_set_clr_event(tr, NULL, system, event, set); +} +EXPORT_SYMBOL_GPL(trace_array_set_clr_event); + /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 -- cgit v1.2.3 From 0e24220821b0e0e330a18bfef29ac6396545d62e Mon Sep 17 00:00:00 2001 From: Hassan Naveed Date: Fri, 15 Nov 2019 23:44:42 +0000 Subject: tracing: Use xarray for syscall trace events Currently, a lot of memory is wasted for architectures like MIPS when init_ftrace_syscalls() allocates the array for syscalls using kcalloc. This is because syscalls numbers start from 4000, 5000 or 6000 and array elements up to that point are unused. Fix this by using a data structure more suited to storing sparsely populated arrays. The XARRAY data structure, implemented using radix trees, is much more memory efficient for storing the syscalls in question. 
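A minimal standalone sketch of the xarray pattern the patch adopts (the index 4123 is a made-up MIPS-style syscall number, not taken from the patch):

#include <linux/xarray.h>

struct syscall_metadata;

static DEFINE_XARRAY(metadata_sparse);

static void example(struct syscall_metadata *meta)
{
	/* Store at a sparse index; only populated slots consume memory.
	 * xa_store() returns the previous entry, or an xa_err()-encoded
	 * pointer if its internal allocation failed. */
	void *old = xa_store(&metadata_sparse, 4123, meta, GFP_KERNEL);

	WARN(xa_is_err(old), "xarray store failed\n");

	/* Lookup is a plain load; an empty slot returns NULL. */
	meta = xa_load(&metadata_sparse, 4123);
}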
Link: http://lkml.kernel.org/r/20191115234314.21599-1-hnaveed@wavecomp.com Signed-off-by: Hassan Naveed Reviewed-by: Paul Burton Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_syscalls.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index fa8fbff736d6..16fa218556fa 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -7,6 +7,7 @@ #include /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ #include #include +#include <linux/xarray.h> #include #include "trace_output.h" @@ -30,6 +31,7 @@ syscall_get_enter_fields(struct trace_event_call *call) extern struct syscall_metadata *__start_syscalls_metadata[]; extern struct syscall_metadata *__stop_syscalls_metadata[]; +static DEFINE_XARRAY(syscalls_metadata_sparse); static struct syscall_metadata **syscalls_metadata; #ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME @@ -101,6 +103,9 @@ find_syscall_meta(unsigned long syscall) static struct syscall_metadata *syscall_nr_to_meta(int nr) { + if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) + return xa_load(&syscalls_metadata_sparse, (unsigned long)nr); + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) return NULL; @@ -536,12 +541,16 @@ void __init init_ftrace_syscalls(void) struct syscall_metadata *meta; unsigned long addr; int i; - - syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), - GFP_KERNEL); - if (!syscalls_metadata) { - WARN_ON(1); - return; + void *ret; + + if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { + syscalls_metadata = kcalloc(NR_syscalls, + sizeof(*syscalls_metadata), + GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return; + } } for (i = 0; i < NR_syscalls; i++) { @@ -551,7 +560,16 @@ void __init init_ftrace_syscalls(void) continue; meta->syscall_nr = i; - syscalls_metadata[i] = meta; + + if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { + syscalls_metadata[i] = meta; + } else { + ret = xa_store(&syscalls_metadata_sparse, i, meta, + GFP_KERNEL); + WARN(xa_is_err(ret), + "Syscall memory allocation failed\n"); + } + } } -- cgit v1.2.3 From 107e899874e95dcddc779142942bf285eba38bc5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Nov 2019 16:22:21 -0400 Subject: mm/hmm: define the pre-processor related parts of hmm.h even if disabled Only the function calls are stubbed out with static inlines that always fail. This is the standard way to write a header for an optional component and makes it easier for drivers that only optionally need HMM_MIRROR. Link: https://lore.kernel.org/r/20191112202231.3856-5-jgg@ziepe.ca Reviewed-by: Jérôme Glisse Tested-by: Ralph Campbell Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- kernel/fork.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..ca39cfc404e3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,7 +40,6 @@ #include #include #include -#include <linux/hmm.h> #include #include #include -- cgit v1.2.3 From b111df8447acdeb4b9220f99d5d4b28f83eb56ad Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 25 Nov 2019 21:25:46 +0100 Subject: y2038: alarm: fix half-second cut-off Changing alarm_itimer accidentally broke the logic for arithmetic rounding of half seconds in the return code. Change it to a constant based on NSEC_PER_SEC, as suggested by Ben Hutchings.
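The arithmetic is easy to check by hand: tv_nsec now holds nanoseconds, so the old constant 500000 (correct back when the field held microseconds) compared against half a millisecond, while NSEC_PER_SEC / 2 = 500000000 restores the intended half-second rounding. A small illustrative sketch:

	/* it_value = 3.6 s: at least half a second remains, so round up. */
	time64_t secs = 3;
	long nsec = 600000000;	/* 0.6 s */

	if ((!secs && nsec) || nsec >= (NSEC_PER_SEC / 2))
		secs++;		/* secs == 4 */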
Fixes: bd40a175769d ("y2038: itimer: change implementation to timespec64") Reported-by: Ben Hutchings Signed-off-by: Arnd Bergmann --- kernel/time/itimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 5872db9bd5f7..9e59c9ea92aa 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -297,7 +297,7 @@ static unsigned int alarm_setitimer(unsigned int seconds) * better return too much than too little anyway */ if ((!it_old.it_value.tv_sec && it_old.it_value.tv_nsec) || - it_old.it_value.tv_nsec >= 500000) + it_old.it_value.tv_nsec >= (NSEC_PER_SEC / 2)) it_old.it_value.tv_sec++; return it_old.it_value.tv_sec; -- cgit v1.2.3 From 61a47c1ad3a4dc6882f01ebdc88138ac62d0df03 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 1 Oct 2019 13:01:19 -0500 Subject: sysctl: Remove the sysctl system call This system call has been deprecated almost since it was introduced, and in a survey of the Linux distributions I can no longer find any of them that enable CONFIG_SYSCTL_SYSCALL. The only indication that I can find that anyone might care is that a few of the defconfigs in the kernel enable CONFIG_SYSCTL_SYSCALL. However, this appears in only 31 of 414 defconfigs in the kernel, so I suspect this symbol's presence is simply because it is harmless to include rather than because it is necessary. As there appear to be no users of the sysctl system call, remove the code. As this removes one of the few uses of the internal kernel mount of proc, I hope this allows for even more simplifications of the proc filesystem. Cc: Alex Smith Cc: Anders Berg Cc: Apelete Seketeli Cc: Arnd Bergmann Cc: Chee Nouk Phoon Cc: Chris Zankel Cc: Christian Ruppert Cc: Greg Ungerer Cc: Harvey Hunt Cc: Helge Deller Cc: Hongliang Tao Cc: Hua Yan Cc: Huacai Chen Cc: John Crispin Cc: Jonas Jensen Cc: Josh Boyer Cc: Jun Nie Cc: Kevin Hilman Cc: Kevin Wells Cc: Kumar Gala Cc: Lars-Peter Clausen Cc: Ley Foon Tan Cc: Linus Walleij Cc: Markos Chandras Cc: Max Filippov Cc: Noam Camus Cc: Olof Johansson Cc: Paul Burton Cc: Paul Mundt Cc: Phil Edworthy Cc: Pierrick Hascoet Cc: Ralf Baechle Cc: Roland Stigge Cc: Santosh Shilimkar Cc: Scott Telford Cc: Stephen Boyd Cc: Steven J. Hill Cc: Tanmay Inamdar Cc: Vineet Gupta Cc: Wolfram Sang Acked-by: Andi Kleen Reviewed-by: Kees Cook Signed-off-by: "Eric W. 
Biederman" --- kernel/sysctl_binary.c | 1305 ------------------------------------------------ 1 file changed, 1305 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 73c132095a7b..7d550cc76a3b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -18,1317 +18,12 @@ #include #include -#ifdef CONFIG_SYSCTL_SYSCALL - -struct bin_table; -typedef ssize_t bin_convert_t(struct file *file, - void __user *oldval, size_t oldlen, void __user *newval, size_t newlen); - -static bin_convert_t bin_dir; -static bin_convert_t bin_string; -static bin_convert_t bin_intvec; -static bin_convert_t bin_ulongvec; -static bin_convert_t bin_uuid; -static bin_convert_t bin_dn_node_address; - -#define CTL_DIR bin_dir -#define CTL_STR bin_string -#define CTL_INT bin_intvec -#define CTL_ULONG bin_ulongvec -#define CTL_UUID bin_uuid -#define CTL_DNADR bin_dn_node_address - -#define BUFSZ 256 - -struct bin_table { - bin_convert_t *convert; - int ctl_name; - const char *procname; - const struct bin_table *child; -}; - -static const struct bin_table bin_random_table[] = { - { CTL_INT, RANDOM_POOLSIZE, "poolsize" }, - { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" }, - { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" }, - { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" }, - { CTL_UUID, RANDOM_BOOT_ID, "boot_id" }, - { CTL_UUID, RANDOM_UUID, "uuid" }, - {} -}; - -static const struct bin_table bin_pty_table[] = { - { CTL_INT, PTY_MAX, "max" }, - { CTL_INT, PTY_NR, "nr" }, - {} -}; - -static const struct bin_table bin_kern_table[] = { - { CTL_STR, KERN_OSTYPE, "ostype" }, - { CTL_STR, KERN_OSRELEASE, "osrelease" }, - /* KERN_OSREV not used */ - { CTL_STR, KERN_VERSION, "version" }, - /* KERN_SECUREMASK not used */ - /* KERN_PROF not used */ - { CTL_STR, KERN_NODENAME, "hostname" }, - { CTL_STR, KERN_DOMAINNAME, "domainname" }, - - { CTL_INT, KERN_PANIC, "panic" }, - { CTL_INT, KERN_REALROOTDEV, "real-root-dev" }, - - { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" }, - { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" }, - { CTL_INT, KERN_PRINTK, "printk" }, - - /* KERN_NAMETRANS not used */ - /* KERN_PPC_HTABRECLAIM not used */ - /* KERN_PPC_ZEROPAGED not used */ - { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" }, - - { CTL_STR, KERN_MODPROBE, "modprobe" }, - { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" }, - { CTL_INT, KERN_ACCT, "acct" }, - /* KERN_PPC_L2CR "l2cr" no longer used */ - - /* KERN_RTSIGNR not used */ - /* KERN_RTSIGMAX not used */ - - { CTL_ULONG, KERN_SHMMAX, "shmmax" }, - { CTL_INT, KERN_MSGMAX, "msgmax" }, - { CTL_INT, KERN_MSGMNB, "msgmnb" }, - /* KERN_MSGPOOL not used*/ - { CTL_INT, KERN_SYSRQ, "sysrq" }, - { CTL_INT, KERN_MAX_THREADS, "threads-max" }, - { CTL_DIR, KERN_RANDOM, "random", bin_random_table }, - { CTL_ULONG, KERN_SHMALL, "shmall" }, - { CTL_INT, KERN_MSGMNI, "msgmni" }, - { CTL_INT, KERN_SEM, "sem" }, - { CTL_INT, KERN_SPARC_STOP_A, "stop-a" }, - { CTL_INT, KERN_SHMMNI, "shmmni" }, - - { CTL_INT, KERN_OVERFLOWUID, "overflowuid" }, - { CTL_INT, KERN_OVERFLOWGID, "overflowgid" }, - - { CTL_STR, KERN_HOTPLUG, "hotplug", }, - { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" }, - - { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" }, - { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" }, - /* KERN_TAINTED "tainted" no longer used */ - { CTL_INT, KERN_CADPID, "cad_pid" }, - { CTL_INT, KERN_PIDMAX, "pid_max" }, - { CTL_STR, KERN_CORE_PATTERN, "core_pattern" }, - { CTL_INT, KERN_PANIC_ON_OOPS, 
"panic_on_oops" }, - { CTL_INT, KERN_HPPA_PWRSW, "soft-power" }, - { CTL_INT, KERN_HPPA_UNALIGNED, "unaligned-trap" }, - - { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" }, - { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" }, - - { CTL_DIR, KERN_PTY, "pty", bin_pty_table }, - { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" }, - { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" }, - /* KERN_HZ_TIMER "hz_timer" no longer used */ - { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, - { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" }, - { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" }, - - { CTL_INT, KERN_SPIN_RETRY, "spin_retry" }, - /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */ - { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, - { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, - { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, - { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, - { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, - { CTL_ULONG, KERN_PANIC_PRINT, "panic_print" }, - {} -}; - -static const struct bin_table bin_vm_table[] = { - { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, - { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" }, - { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, - { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, - /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ - /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ - /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */ - { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, - /* VM_PAGEBUF unused */ - /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ - { CTL_INT, VM_SWAPPINESS, "swappiness" }, - { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" }, - { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" }, - { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" }, - { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" }, - { CTL_INT, VM_BLOCK_DUMP, "block_dump" }, - { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" }, - { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" }, - { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" }, - /* VM_SWAP_TOKEN_TIMEOUT unused */ - { CTL_INT, VM_DROP_PAGECACHE, "drop_caches" }, - { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" }, - { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" }, - { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" }, - { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" }, - { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" }, - { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" }, - - {} -}; - -static const struct bin_table bin_net_core_table[] = { - { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" }, - { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" }, - { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" }, - { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" }, - /* NET_CORE_DESTROY_DELAY unused */ - { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" }, - /* NET_CORE_FASTROUTE unused */ - { CTL_INT, NET_CORE_MSG_COST, "message_cost" }, - { CTL_INT, NET_CORE_MSG_BURST, "message_burst" }, - { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" }, - /* NET_CORE_HOT_LIST_LENGTH unused */ - /* NET_CORE_DIVERT_VERSION unused */ - /* NET_CORE_NO_CONG_THRESH unused */ - /* NET_CORE_NO_CONG unused */ - /* NET_CORE_LO_CONG unused */ - /* NET_CORE_MOD_CONG unused */ - { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" }, - { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" }, - { CTL_INT, NET_CORE_BUDGET, "netdev_budget" }, - { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, - { CTL_INT, 
NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, - { CTL_INT, NET_CORE_WARNINGS, "warnings" }, - {}, -}; - -static const struct bin_table bin_net_unix_table[] = { - /* NET_UNIX_DESTROY_DELAY unused */ - /* NET_UNIX_DELETE_DELAY unused */ - { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, - {} -}; - -static const struct bin_table bin_net_ipv4_route_table[] = { - { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" }, - /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */ - /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */ - { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" }, - { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" }, - { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, - { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, - { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, - /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */ - { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, - { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, - { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, - { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" }, - { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" }, - { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" }, - { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, - { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, - { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, - {} -}; - -static const struct bin_table bin_net_ipv4_conf_vars_table[] = { - { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" }, - { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, - - { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" }, - { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" }, - { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" }, - { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" }, - { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" }, - { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" }, - { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" }, - { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" }, - { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" }, - { CTL_INT, NET_IPV4_CONF_TAG, "tag" }, - { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" }, - { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" }, - { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, - { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, - { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" }, - - { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" }, - { CTL_INT, NET_IPV4_CONF_NOPOLICY, "disable_policy" }, - { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" }, - { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, - {} -}; - -static const struct bin_table bin_net_ipv4_conf_table[] = { - { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table }, - { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table }, - { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table }, - {} -}; - -static const struct bin_table bin_net_neigh_vars_table[] = { - { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, - { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, - { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" }, - /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */ - { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" }, - { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" }, - { CTL_INT, 
NET_NEIGH_GC_STALE_TIME, "gc_stale_time" }, - { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" }, - { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" }, - /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */ - /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */ - /* NET_NEIGH_LOCKTIME "locktime" no longer used */ - { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" }, - { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" }, - { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" }, - { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" }, - { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" }, - { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" }, - {} -}; - -static const struct bin_table bin_net_neigh_table[] = { - { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table }, - { CTL_DIR, 0, NULL, bin_net_neigh_vars_table }, - {} -}; - -static const struct bin_table bin_net_ipv4_netfilter_table[] = { - { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, - - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */ - - /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */ - /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */ - /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */ - /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */ - - { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" }, - /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */ - { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" }, - { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" }, - - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */ - /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ - - { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" }, - { CTL_INT, 
NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" }, - {} -}; - -static const struct bin_table bin_net_ipv4_table[] = { - {CTL_INT, NET_IPV4_FORWARD, "ip_forward" }, - - { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table }, - { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table }, - { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table }, - /* NET_IPV4_FIB_HASH unused */ - { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table }, - - { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, - { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, - { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" }, - { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" }, - { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" }, - /* NET_IPV4_AUTOCONFIG unused */ - { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" }, - { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" }, - { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" }, - { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" }, - { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" }, - { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" }, - { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" }, - { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" }, - { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" }, - { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" }, - { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, - { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, - { CTL_INT, NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, - { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" }, - { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, - { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, - { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" }, - { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" }, - { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" }, - { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" }, - { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" }, - { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" }, - { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" }, - { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" }, - { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" }, - { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" }, - { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" }, - { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" }, - { CTL_INT, NET_TCP_FACK, "tcp_fack" }, - { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" }, - { CTL_INT, NET_TCP_ECN, "tcp_ecn" }, - { CTL_INT, NET_TCP_DSACK, "tcp_dsack" }, - { CTL_INT, NET_TCP_MEM, "tcp_mem" }, - { CTL_INT, NET_TCP_WMEM, "tcp_wmem" }, - { CTL_INT, NET_TCP_RMEM, "tcp_rmem" }, - { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" }, - { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" }, - { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" }, - { CTL_INT, NET_TCP_FRTO, "tcp_frto" }, - { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" }, - { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" }, - { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" }, - { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, - { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, - { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, - { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, - { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, - { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, - { CTL_INT, 
NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, - { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, - { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, - { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" }, - { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" }, - /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */ - { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" }, - { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" }, - - { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" }, - { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" }, - { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" }, - { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" }, - { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" }, - { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" }, - - { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" }, - { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" }, - { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" }, - - { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, - /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */ - - { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, - - /* NET_TCP_DEFAULT_WIN_SCALE unused */ - /* NET_TCP_BIC_BETA unused */ - /* NET_IPV4_TCP_MAX_KA_PROBES unused */ - /* NET_IPV4_IP_MASQ_DEBUG unused */ - /* NET_TCP_SYN_TAILDROP unused */ - /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */ - /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */ - /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */ - /* NET_IPV4_ICMP_PARAMPROB_RATE unused */ - /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */ - /* NET_IPV4_ALWAYS_DEFRAG unused */ - {} -}; - -static const struct bin_table bin_net_ipx_table[] = { - { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, - /* NET_IPX_FORWARDING unused */ - {} -}; - -static const struct bin_table bin_net_atalk_table[] = { - { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, - { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, - { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, - { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" }, - {}, -}; - -static const struct bin_table bin_net_netrom_table[] = { - { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, - { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, - { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, - { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" }, - { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" }, - { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" }, - { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" }, - { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" }, - { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" }, - { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" }, - { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" }, - { CTL_INT, NET_NETROM_RESET, "reset" }, - {} -}; - -static const struct bin_table bin_net_ax25_param_table[] = { - { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, - { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, - { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" }, 
- { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" }, - { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" }, - { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" }, - { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" }, - { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" }, - { CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" }, - { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" }, - { CTL_INT, NET_AX25_N2, "maximum_retry_count" }, - { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" }, - { CTL_INT, NET_AX25_PROTOCOL, "protocol" }, - { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" }, - {} -}; - -static const struct bin_table bin_net_ax25_table[] = { - { CTL_DIR, 0, NULL, bin_net_ax25_param_table }, - {} -}; - -static const struct bin_table bin_net_rose_table[] = { - { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, - { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, - { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, - { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, - { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" }, - { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" }, - { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" }, - { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" }, - { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" }, - { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" }, - {} -}; - -static const struct bin_table bin_net_ipv6_conf_var_table[] = { - { CTL_INT, NET_IPV6_FORWARDING, "forwarding" }, - { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" }, - { CTL_INT, NET_IPV6_MTU, "mtu" }, - { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" }, - { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" }, - { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" }, - { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" }, - { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" }, - { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" }, - { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" }, - { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" }, - { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" }, - { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" }, - { CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" }, - { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" }, - { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" }, - { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" }, - { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, - { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, - { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, - { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" }, - {} -}; - -static const struct bin_table bin_net_ipv6_conf_table[] = { - { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table }, - { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table }, - { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table }, - {} -}; - -static const struct bin_table bin_net_ipv6_route_table[] = { - /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */ - { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, - { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, - 
{ CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
-	{ CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" },
-	{ CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" },
-	{ CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" },
-	{ CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" },
-	{ CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" },
-	{ CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
-	{}
-};
-
-static const struct bin_table bin_net_ipv6_icmp_table[] = {
-	{ CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" },
-	{}
-};
-
-static const struct bin_table bin_net_ipv6_table[] = {
-	{ CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table },
-	{ CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table },
-	{ CTL_DIR, NET_IPV6_ROUTE, "route", bin_net_ipv6_route_table },
-	{ CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table },
-	{ CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" },
-	{ CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" },
-	{ CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" },
-	{ CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" },
-	{ CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" },
-	{ CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" },
-	{ CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" },
-	{}
-};
-
-static const struct bin_table bin_net_x25_table[] = {
-	{ CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" },
-	{ CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" },
-	{ CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" },
-	{ CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" },
-	{ CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" },
-	{ CTL_INT, NET_X25_FORWARD, "x25_forward" },
-	{}
-};
-
-static const struct bin_table bin_net_tr_table[] = {
-	{ CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" },
-	{}
-};
-
-
-static const struct bin_table bin_net_decnet_conf_vars[] = {
-	{ CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" },
-	{ CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" },
-	{ CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" },
-	{ CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" },
-	{}
-};
-
-static const struct bin_table bin_net_decnet_conf[] = {
-	{ CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars },
-	{ CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars },
-	{ CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars },
-	{ CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars },
-	{ CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars },
-	{ CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars },
-	{ CTL_DIR, 0, NULL, bin_net_decnet_conf_vars },
-	{}
-};
-
-static const struct bin_table bin_net_decnet_table[] = {
-	{ CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf },
-	{ CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" },
-	{ CTL_STR, NET_DECNET_NODE_NAME, "node_name" },
-	{ CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" },
-	{ CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" },
-	{ CTL_INT, NET_DECNET_DN_COUNT, "dn_count" },
-	{ CTL_INT, NET_DECNET_DI_COUNT, "di_count" },
-	{ CTL_INT, NET_DECNET_DR_COUNT, "dr_count" },
-	{ CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" },
-	{ CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" },
-	{ CTL_INT, NET_DECNET_MEM, "decnet_mem" },
-	{ CTL_INT, NET_DECNET_RMEM, "decnet_rmem" },
-	{ CTL_INT, NET_DECNET_WMEM, "decnet_wmem" },
-	{ CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" },
-	{}
-};
-
-static const struct bin_table bin_net_sctp_table[] = {
-	{ CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" },
-	{ CTL_INT, NET_SCTP_RTO_MIN, "rto_min" },
-	{ CTL_INT, NET_SCTP_RTO_MAX, "rto_max" },
-	{ CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" },
-	{ CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" },
-	{ CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" },
-	{ CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" },
-	{ CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" },
-	{ CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" },
-	{ CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" },
-	{ CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" },
-	{ CTL_INT, NET_SCTP_MAX_BURST, "max_burst" },
-	{ CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" },
-	{ CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" },
-	{ CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" },
-	{ CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" },
-	{ CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" },
-	{}
-};
-
-static const struct bin_table bin_net_llc_llc2_timeout_table[] = {
-	{ CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" },
-	{ CTL_INT, NET_LLC2_P_TIMEOUT, "p" },
-	{ CTL_INT, NET_LLC2_REJ_TIMEOUT, "rej" },
-	{ CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" },
-	{}
-};
-
-static const struct bin_table bin_net_llc_station_table[] = {
-	{ CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" },
-	{}
-};
-
-static const struct bin_table bin_net_llc_llc2_table[] = {
-	{ CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table },
-	{}
-};
-
-static const struct bin_table bin_net_llc_table[] = {
-	{ CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table },
-	{ CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table },
-	{}
-};
-
-static const struct bin_table bin_net_netfilter_table[] = {
-	{ CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" },
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */
-	/* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */
-	/* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */
-	/* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */
-	/* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */
-	{ CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" },
-	{ CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" },
-	/* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no longer used */
-	{ CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" },
-	{ CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" },
-	{ CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" },
-	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */
-	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */
-	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */
-	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */
-	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */
-	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */
-	/* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */
-	{ CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" },
-	/* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */
-	/* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */
-	{ CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" },
-	{ CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" },
-	{ CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" },
-
-	{}
-};
-
-static const struct bin_table bin_net_table[] = {
-	{ CTL_DIR, NET_CORE, "core", bin_net_core_table },
-	/* NET_ETHER not used */
-	/* NET_802 not used */
-	{ CTL_DIR, NET_UNIX, "unix", bin_net_unix_table },
-	{ CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table },
-	{ CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table },
-	{ CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table },
-	{ CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table },
-	{ CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table },
-	/* NET_BRIDGE "bridge" no longer used */
-	{ CTL_DIR, NET_ROSE, "rose", bin_net_rose_table },
-	{ CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table },
-	{ CTL_DIR, NET_X25, "x25", bin_net_x25_table },
-	{ CTL_DIR, NET_TR, "token-ring", bin_net_tr_table },
-	{ CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table },
-	/* NET_ECONET not used */
-	{ CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table },
-	{ CTL_DIR, NET_LLC, "llc", bin_net_llc_table },
-	{ CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table },
-	/* NET_DCCP "dccp" no longer used */
-	/* NET_IRDA "irda" no longer used */
-	{ CTL_INT, 2089, "nf_conntrack_max" },
-	{}
-};
-
-static const struct bin_table bin_fs_quota_table[] = {
-	{ CTL_INT, FS_DQ_LOOKUPS, "lookups" },
-	{ CTL_INT, FS_DQ_DROPS, "drops" },
-	{ CTL_INT, FS_DQ_READS, "reads" },
-	{ CTL_INT, FS_DQ_WRITES, "writes" },
-	{ CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" },
-	{ CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" },
-	{ CTL_INT, FS_DQ_FREE, "free_dquots" },
-	{ CTL_INT, FS_DQ_SYNCS, "syncs" },
-	{ CTL_INT, FS_DQ_WARNINGS, "warnings" },
-	{}
-};
-
-static const struct bin_table bin_fs_xfs_table[] = {
-	{ CTL_INT, XFS_SGID_INHERIT, "irix_sgid_inherit" },
-	{ CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" },
-	{ CTL_INT, XFS_PANIC_MASK, "panic_mask" },
-
-	{ CTL_INT, XFS_ERRLEVEL, "error_level" },
-	{ CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" },
-	{ CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" },
-	{ CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" },
-	{ CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" },
-	{ CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" },
-	{ CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" },
-	{ CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" },
-	{ CTL_INT, XFS_ROTORSTEP, "rotorstep" },
-	{ CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" },
-	{ CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" },
-	{ CTL_INT, XFS_STATS_CLEAR, "stats_clear" },
-	{}
-};
-
-static const struct bin_table bin_fs_ocfs2_nm_table[] = {
-	{ CTL_STR, 1, "hb_ctl_path" },
-	{}
-};
-
-static const struct bin_table bin_fs_ocfs2_table[] = {
-	{ CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table },
-	{}
-};
-
-static const struct bin_table bin_inotify_table[] = {
-	{ CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" },
-	{ CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" },
-	{ CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" },
-	{}
-};
-
-static const struct bin_table bin_fs_table[] = {
-	{ CTL_INT, FS_NRINODE, "inode-nr" },
-	{ CTL_INT, FS_STATINODE, "inode-state" },
-	/* FS_MAXINODE unused */
-	/* FS_NRDQUOT unused */
-	/* FS_MAXDQUOT unused */
-	/* FS_NRFILE "file-nr" no longer used */
-	{ CTL_INT, FS_MAXFILE, "file-max" },
-	{ CTL_INT, FS_DENTRY, "dentry-state" },
-	/* FS_NRSUPER unused */
-	/* FS_MAXUPSER unused */
-	{ CTL_INT, FS_OVERFLOWUID, "overflowuid" },
-	{ CTL_INT, FS_OVERFLOWGID, "overflowgid" },
-	{ CTL_INT, FS_LEASES, "leases-enable" },
-	{ CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" },
-	{ CTL_INT, FS_LEASE_TIME, "lease-break-time" },
-	{ CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table },
-	{ CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table },
-	{ CTL_ULONG, FS_AIO_NR, "aio-nr" },
-	{ CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" },
-	{ CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table },
-	{ CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table },
-	{ CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" },
-	{}
-};
-
-static const struct bin_table bin_ipmi_table[] = {
-	{ CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" },
-	{}
-};
-
-static const struct bin_table bin_mac_hid_files[] = {
-	/* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */
-	/* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */
-	{ CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" },
-	{ CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" },
-	{ CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" },
-	/* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */
-	{}
-};
-
-static const struct bin_table bin_raid_table[] = {
-	{ CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" },
-	{ CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" },
-	{}
-};
-
-static const struct bin_table bin_scsi_table[] = {
-	{ CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" },
-	{}
-};
-
-static const struct bin_table bin_dev_table[] = {
-	/* DEV_CDROM "cdrom" no longer used */
-	/* DEV_HWMON unused */
-	/* DEV_PARPORT "parport" no longer used */
-	{ CTL_DIR, DEV_RAID, "raid", bin_raid_table },
-	{ CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files },
-	{ CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table },
-	{ CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table },
-	{}
-};
-
-static const struct bin_table bin_bus_isa_table[] = {
-	{ CTL_INT, BUS_ISA_MEM_BASE, "membase" },
-	{ CTL_INT, BUS_ISA_PORT_BASE, "portbase" },
-	{ CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" },
-	{}
-};
-
-static const struct bin_table bin_bus_table[] = {
-	{ CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table },
-	{}
-};
-
-
-static const struct bin_table bin_s390dbf_table[] = {
-	{ CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
-	{ CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
-	{}
-};
-
-static const struct bin_table bin_sunrpc_table[] = {
-	/* CTL_RPCDEBUG "rpc_debug" no longer used */
-	/* CTL_NFSDEBUG "nfs_debug" no longer used */
-	/* CTL_NFSDDEBUG "nfsd_debug" no longer used */
-	/* CTL_NLMDEBUG "nlm_debug" no longer used */
-
-	{ CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" },
-	{ CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" },
-	{ CTL_INT, CTL_MIN_RESVPORT, "min_resvport" },
-	{ CTL_INT, CTL_MAX_RESVPORT, "max_resvport" },
-	{}
-};
-
-static const struct bin_table bin_pm_table[] = {
-	/* frv specific */
-	/* 1 == CTL_PM_SUSPEND "suspend" no longer used" */
-	{ CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" },
-	{ CTL_INT, 3 /* CTL_PM_P0 */, "p0" },
-	{ CTL_INT, 4 /* CTL_PM_CM */, "cm" },
-	{}
-};
-
-static const struct bin_table bin_root_table[] = {
-	{ CTL_DIR, CTL_KERN, "kernel", bin_kern_table },
-	{ CTL_DIR, CTL_VM, "vm", bin_vm_table },
-	{ CTL_DIR, CTL_NET, "net", bin_net_table },
-	/* CTL_PROC not used */
-	{ CTL_DIR, CTL_FS, "fs", bin_fs_table },
-	/* CTL_DEBUG "debug" no longer used */
-	{ CTL_DIR, CTL_DEV, "dev", bin_dev_table },
-	{ CTL_DIR, CTL_BUS, "bus", bin_bus_table },
-	{ CTL_DIR, CTL_ABI, "abi" },
-	/* CTL_CPU not used */
-	/* CTL_ARLAN "arlan" no longer used */
-	{ CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table },
-	{ CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table },
-	{ CTL_DIR, CTL_PM, "pm", bin_pm_table },
-	{}
-};
-
-static ssize_t bin_dir(struct file *file,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	return -ENOTDIR;
-}
-
-
-static ssize_t bin_string(struct file *file,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	ssize_t result, copied = 0;
-
-	if (oldval && oldlen) {
-		char __user *lastp;
-		loff_t pos = 0;
-		int ch;
-
-		result = vfs_read(file, oldval, oldlen, &pos);
-		if (result < 0)
-			goto out;
-
-		copied = result;
-		lastp = oldval + copied - 1;
-
-		result = -EFAULT;
-		if (get_user(ch, lastp))
-			goto out;
-
-		/* Trim off the trailing newline */
-		if (ch == '\n') {
-			result = -EFAULT;
-			if (put_user('\0', lastp))
-				goto out;
-			copied -= 1;
-		}
-	}
-
-	if (newval && newlen) {
-		loff_t pos = 0;
-
-		result = vfs_write(file, newval, newlen, &pos);
-		if (result < 0)
-			goto out;
-	}
-
-	result = copied;
-out:
-	return result;
-}
-
-static ssize_t bin_intvec(struct file *file,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	ssize_t copied = 0;
-	char *buffer;
-	ssize_t result;
-
-	result = -ENOMEM;
-	buffer = kmalloc(BUFSZ, GFP_KERNEL);
-	if (!buffer)
-		goto out;
-
-	if (oldval && oldlen) {
-		unsigned __user *vec = oldval;
-		size_t length = oldlen / sizeof(*vec);
-		char *str, *end;
-		int i;
-		loff_t pos = 0;
-
-		result = kernel_read(file, buffer, BUFSZ - 1, &pos);
-		if (result < 0)
-			goto out_kfree;
-
-		str = buffer;
-		end = str + result;
-		*end++ = '\0';
-		for (i = 0; i < length; i++) {
-			unsigned long value;
-
-			value = simple_strtoul(str, &str, 10);
-			while (isspace(*str))
-				str++;
-
-			result = -EFAULT;
-			if (put_user(value, vec + i))
-				goto out_kfree;
-
-			copied += sizeof(*vec);
-			if (!isdigit(*str))
-				break;
-		}
-	}
-
-	if (newval && newlen) {
-		unsigned __user *vec = newval;
-		size_t length = newlen / sizeof(*vec);
-		char *str, *end;
-		int i;
-		loff_t pos = 0;
-
-		str = buffer;
-		end = str + BUFSZ;
-		for (i = 0; i < length; i++) {
-			unsigned long value;
-
-			result = -EFAULT;
-			if (get_user(value, vec + i))
-				goto out_kfree;
-
-			str += scnprintf(str, end - str, "%lu\t", value);
-		}
-
-		result = kernel_write(file, buffer, str - buffer, &pos);
-		if (result < 0)
-			goto out_kfree;
-	}
-	result = copied;
-out_kfree:
-	kfree(buffer);
-out:
-	return result;
-}
-
-static ssize_t bin_ulongvec(struct file *file,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	ssize_t copied = 0;
-	char *buffer;
-	ssize_t result;
-
-	result = -ENOMEM;
-	buffer = kmalloc(BUFSZ, GFP_KERNEL);
-	if (!buffer)
-		goto out;
-
-	if (oldval && oldlen) {
-		unsigned long __user *vec = oldval;
-		size_t length = oldlen / sizeof(*vec);
-		char *str, *end;
-		int i;
-		loff_t pos = 0;
-
-		result = kernel_read(file, buffer, BUFSZ - 1, &pos);
-		if (result < 0)
-			goto out_kfree;
-
-		str = buffer;
-		end = str + result;
-		*end++ = '\0';
-		for (i = 0; i < length; i++) {
-			unsigned long value;
-
-			value = simple_strtoul(str, &str, 10);
-			while (isspace(*str))
-				str++;
-
-			result = -EFAULT;
-			if (put_user(value, vec + i))
-				goto out_kfree;
-
-			copied += sizeof(*vec);
-			if (!isdigit(*str))
-				break;
-		}
-	}
-
-	if (newval && newlen) {
-		unsigned long __user *vec = newval;
-		size_t length = newlen / sizeof(*vec);
-		char *str, *end;
-		int i;
-		loff_t pos = 0;
-
-		str = buffer;
-		end = str + BUFSZ;
-		for (i = 0; i < length; i++) {
-			unsigned long value;
-
-			result = -EFAULT;
-			if (get_user(value, vec + i))
-				goto out_kfree;
-
-			str += scnprintf(str, end - str, "%lu\t", value);
-		}
-
-		result = kernel_write(file, buffer, str - buffer, &pos);
-		if (result < 0)
-			goto out_kfree;
-	}
-	result = copied;
-out_kfree:
-	kfree(buffer);
-out:
-	return result;
-}
-
-static ssize_t bin_uuid(struct file *file,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	ssize_t result, copied = 0;
-
-	/* Only supports reads */
-	if (oldval && oldlen) {
-		char buf[UUID_STRING_LEN + 1];
-		uuid_t uuid;
-		loff_t pos = 0;
-
-		result = kernel_read(file, buf, sizeof(buf) - 1, &pos);
-		if (result < 0)
-			goto out;
-
-		buf[result] = '\0';
-
-		result = -EIO;
-		if (uuid_parse(buf, &uuid))
-			goto out;
-
-		if (oldlen > 16)
-			oldlen = 16;
-
-		result = -EFAULT;
-		if (copy_to_user(oldval, &uuid, oldlen))
-			goto out;
-
-		copied = oldlen;
-	}
-	result = copied;
-out:
-	return result;
-}
-
-static ssize_t bin_dn_node_address(struct file *file,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	ssize_t result, copied = 0;
-
-	if (oldval && oldlen) {
-		char buf[15], *nodep;
-		unsigned long area, node;
-		__le16 dnaddr;
-		loff_t pos = 0;
-
-		result = kernel_read(file, buf, sizeof(buf) - 1, &pos);
-		if (result < 0)
-			goto out;
-
-		buf[result] = '\0';
-
-		/* Convert the decnet address to binary */
-		result = -EIO;
-		nodep = strchr(buf, '.');
-		if (!nodep)
-			goto out;
-		++nodep;
-
-		area = simple_strtoul(buf, NULL, 10);
-		node = simple_strtoul(nodep, NULL, 10);
-
-		result = -EIO;
-		if ((area > 63)||(node > 1023))
-			goto out;
-
-		dnaddr = cpu_to_le16((area << 10) | node);
-
-		result = -EFAULT;
-		if (put_user(dnaddr, (__le16 __user *)oldval))
-			goto out;
-
-		copied = sizeof(dnaddr);
-	}
-
-	if (newval && newlen) {
-		__le16 dnaddr;
-		char buf[15];
-		int len;
-		loff_t pos = 0;
-
-		result = -EINVAL;
-		if (newlen != sizeof(dnaddr))
-			goto out;
-
-		result = -EFAULT;
-		if (get_user(dnaddr, (__le16 __user *)newval))
-			goto out;
-
-		len = scnprintf(buf, sizeof(buf), "%hu.%hu",
-				le16_to_cpu(dnaddr) >> 10,
-				le16_to_cpu(dnaddr) & 0x3ff);
-
-		result = kernel_write(file, buf, len, &pos);
-		if (result < 0)
-			goto out;
-	}
-
-	result = copied;
-out:
-	return result;
-}
-
-static const struct bin_table *get_sysctl(const int *name, int nlen, char *path)
-{
-	const struct bin_table *table = &bin_root_table[0];
-	int ctl_name;
-
-	/* The binary sysctl tables have a small maximum depth so
-	 * there is no danger of overflowing our path as it PATH_MAX
-	 * bytes long.
-	 */
-	memcpy(path, "sys/", 4);
-	path += 4;
-
-repeat:
-	if (!nlen)
-		return ERR_PTR(-ENOTDIR);
-	ctl_name = *name;
-	name++;
-	nlen--;
-	for ( ; table->convert; table++) {
-		int len = 0;
-
-		/*
-		 * For a wild card entry map from ifindex to network
-		 * device name.
-		 */
-		if (!table->ctl_name) {
-#ifdef CONFIG_NET
-			struct net *net = current->nsproxy->net_ns;
-			struct net_device *dev;
-			dev = dev_get_by_index(net, ctl_name);
-			if (dev) {
-				len = strlen(dev->name);
-				memcpy(path, dev->name, len);
-				dev_put(dev);
-			}
-#endif
-		/* Use the well known sysctl number to proc name mapping */
-		} else if (ctl_name == table->ctl_name) {
-			len = strlen(table->procname);
-			memcpy(path, table->procname, len);
-		}
-		if (len) {
-			path += len;
-			if (table->child) {
-				*path++ = '/';
-				table = table->child;
-				goto repeat;
-			}
-			*path = '\0';
-			return table;
-		}
-	}
-	return ERR_PTR(-ENOTDIR);
-}
-
-static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep)
-{
-	char *tmp, *result;
-
-	result = ERR_PTR(-ENOMEM);
-	tmp = __getname();
-	if (tmp) {
-		const struct bin_table *table = get_sysctl(name, nlen, tmp);
-		result = tmp;
-		*tablep = table;
-		if (IS_ERR(table)) {
-			__putname(tmp);
-			result = ERR_CAST(table);
-		}
-	}
-	return result;
-}
-
-static ssize_t binary_sysctl(const int *name, int nlen,
-	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
-{
-	const struct bin_table *table = NULL;
-	struct vfsmount *mnt;
-	struct file *file;
-	ssize_t result;
-	char *pathname;
-	int flags;
-
-	pathname = sysctl_getname(name, nlen, &table);
-	result = PTR_ERR(pathname);
-	if (IS_ERR(pathname))
-		goto out;
-
-	/* How should the sysctl be accessed? */
-	if (oldval && oldlen && newval && newlen) {
-		flags = O_RDWR;
-	} else if (newval && newlen) {
-		flags = O_WRONLY;
-	} else if (oldval && oldlen) {
-		flags = O_RDONLY;
-	} else {
-		result = 0;
-		goto out_putname;
-	}
-
-	mnt = task_active_pid_ns(current)->proc_mnt;
-	file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0);
-	result = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out_putname;
-
-	result = table->convert(file, oldval, oldlen, newval, newlen);
-
-	fput(file);
-out_putname:
-	__putname(pathname);
-out:
-	return result;
-}
-
-
-#else /* CONFIG_SYSCTL_SYSCALL */
-
 static ssize_t binary_sysctl(const int *name, int nlen,
 	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
 {
 	return -ENOSYS;
 }
-#endif /* CONFIG_SYSCTL_SYSCALL */
-
-
 static void deprecated_sysctl_warning(const int *name, int nlen)
 {
 	int i;
--
cgit v1.2.3
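The compatibility layer deleted above existed only to service the legacy sysctl(2)
system call, which named sysctls by vectors of integers that get_sysctl() translated
into /proc/sys paths. For context, here is a minimal userspace sketch of what that
interface looked like; this is a hypothetical example, not from the patch, and it
assumes a pre-removal kernel on an architecture that still defines __NR__sysctl.
Once the code above is gone, the call simply fails with ENOSYS.

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/sysctl.h>

	int main(void)
	{
		/* Binary name vector: { CTL_KERN, KERN_OSTYPE } maps to
		 * the path "sys/kernel/ostype" built by get_sysctl(). */
		int name[] = { CTL_KERN, KERN_OSTYPE };
		char buf[64];
		size_t len = sizeof(buf);
		struct __sysctl_args args = {
			.name = name,
			.nlen = 2,
			.oldval = buf,
			.oldlenp = &len,
		};

		if (syscall(SYS__sysctl, &args) < 0) {
			perror("_sysctl");	/* ENOSYS after the removal */
			return 1;
		}
		printf("%.*s\n", (int)len, buf);
		return 0;
	}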
From ff68dac6d65cd1347dad5d780dd8c90f29dc1b0b Mon Sep 17 00:00:00 2001
From: Gaowei Pu
Date: Sat, 30 Nov 2019 17:51:03 -0800
Subject: mm/mmap.c: use IS_ERR_VALUE to check return value of get_unmapped_area
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

get_unmapped_area() returns an address or -errno on failure.  Historically
we have checked for failure with offset_in_page(), which is correct but
quite hard to read.  Newer code started using IS_ERR_VALUE, which is much
easier to read.  Convert the remaining users of offset_in_page() as well.

[mhocko@suse.com: rewrite changelog]
[mhocko@kernel.org: fix mremap.c and uprobes.c sites also]
Link: http://lkml.kernel.org/r/20191012102512.28051-1-pugaowei@gmail.com
Signed-off-by: Gaowei Pu
Reviewed-by: Andrew Morton
Acked-by: Michal Hocko
Cc: Vlastimil Babka
Cc: Wei Yang
Cc: Konstantin Khlebnikov
Cc: Kirill A. Shutemov
Cc: "Jérôme Glisse"
Cc: Mike Kravetz
Cc: Rik van Riel
Cc: Qian Cai
Cc: Shakeel Butt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/events/uprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c74761004ee5..ece7e13f6e4a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1457,7 +1457,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
 	/* Try to map as high as possible, this is only a hint. */
 	area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
 					PAGE_SIZE, 0, 0);
-	if (area->vaddr & ~PAGE_MASK) {
+	if (IS_ERR_VALUE(area->vaddr)) {
 		ret = area->vaddr;
 		goto fail;
 	}
--
cgit v1.2.3
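To make the readability argument concrete: an error return from
get_unmapped_area() is a -errno stored in an unsigned long, which is never
page-aligned, so both checks catch the same values. Here is a small,
self-contained userspace illustration (hypothetical, with simplified versions
of the kernel macros inlined and a 4K page size assumed):

	#include <stdio.h>

	#define MAX_ERRNO		4095UL
	#define IS_ERR_VALUE(x)		((unsigned long)(x) >= -MAX_ERRNO)
	#define offset_in_page(p)	((unsigned long)(p) & 4095UL)

	int main(void)
	{
		unsigned long good = 0x7f0000000000UL;	 /* page-aligned mapping */
		unsigned long bad = (unsigned long)-12;	 /* -ENOMEM from the kernel */

		/* Old style: error values are never page-aligned, so this works... */
		printf("offset_in_page: good=%lu bad=%lu\n",
		       offset_in_page(good), offset_in_page(bad));

		/* ...but the new style states the intent: is this a -errno? */
		printf("IS_ERR_VALUE:   good=%d bad=%d\n",
		       (int)IS_ERR_VALUE(good), (int)IS_ERR_VALUE(bad));
		return 0;
	}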
From eafb149ed73a8bb8359c0ce027b98acd4e95b070 Mon Sep 17 00:00:00 2001
From: Daniel Axtens
Date: Sat, 30 Nov 2019 17:54:57 -0800
Subject: fork: support VMAP_STACK with KASAN_VMALLOC

Supporting VMAP_STACK with KASAN_VMALLOC is straightforward:

 - clear the shadow region of vmapped stacks when swapping them in
 - tweak Kconfig to allow VMAP_STACK to be turned on with KASAN

Link: http://lkml.kernel.org/r/20191031093909.9228-4-dja@axtens.net
Signed-off-by: Daniel Axtens
Reviewed-by: Dmitry Vyukov
Reviewed-by: Andrey Ryabinin
Cc: Alexander Potapenko
Cc: Christophe Leroy
Cc: Mark Rutland
Cc: Vasily Gorbik
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 0f0bac8318dd..21c6c1e29b98 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -93,6 +93,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -223,6 +224,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 		if (!s)
 			continue;

+		/* Clear the KASAN shadow of the stack. */
+		kasan_unpoison_shadow(s->addr, THREAD_SIZE);
+
 		/* Clear stale pointers from reused stack. */
 		memset(s->addr, 0, THREAD_SIZE);
--
cgit v1.2.3
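Why the unpoison must come before the memset: with KASAN_VMALLOC, a vmapped
stack is backed by real shadow memory, so poison left behind by the previous
task survives in the per-cpu stack cache, and touching the stack again would
trip a KASAN report. A condensed, hypothetical sketch of the cached-stack
path after this patch (reuse_cached_stack() is an illustrative name, not a
function from the patch; kasan_unpoison_shadow() is a no-op without KASAN):

	#include <linux/kasan.h>
	#include <linux/string.h>
	#include <linux/thread_info.h>	/* THREAD_SIZE, arch-defined */
	#include <linux/vmalloc.h>

	/* Sketch: handing out a vmapped stack from the per-cpu cache. */
	static void *reuse_cached_stack(struct vm_struct *s)
	{
		/*
		 * Wipe any stale KASAN poison left by the stack's previous
		 * owner; without this, the memset below would itself be
		 * reported as a bad access to poisoned shadow.
		 */
		kasan_unpoison_shadow(s->addr, THREAD_SIZE);

		/* Clear stale pointers from reused stack. */
		memset(s->addr, 0, THREAD_SIZE);
		return s->addr;
	}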
From 204cb79ad42f015312a5bbd7012d09c93d9b46fb Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Sat, 30 Nov 2019 17:56:08 -0800
Subject: kernel: sysctl: make drop_caches write-only

Currently, the drop_caches proc file and sysctl read back the last value
written, suggesting this is somehow a stateful setting instead of a
one-time command.  Make it write-only, like e.g. compact_memory.

While we were mitigating a VM problem at scale in our fleet, there was
confusion about whether writing to this file would permanently switch the
kernel into a non-caching mode.  This influenced the decision making in a
tense situation, where tens of people were trying to fix tens of thousands
of affected machines: Do we need a rollback strategy?  What are the
performance implications of operating in a non-caching state for several
days?  It also caused confusion when the kernel team said we might need to
write the file several times to make sure it was effective ("But it
already reads back 3?").

Link: http://lkml.kernel.org/r/20191031221602.9375-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner
Acked-by: Chris Down
Acked-by: Vlastimil Babka
Acked-by: David Hildenbrand
Acked-by: Michal Hocko
Acked-by: Alexey Dobriyan
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/sysctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b6f2f35d0bcf..70665934d53e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1466,7 +1466,7 @@ static struct ctl_table vm_table[] = {
 		.procname	= "drop_caches",
 		.data		= &sysctl_drop_caches,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0200,
 		.proc_handler	= drop_caches_sysctl_handler,
 		.extra1		= SYSCTL_ONE,
 		.extra2		= &four,
--
cgit v1.2.3
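The user-visible effect of changing the mode from 0644 to 0200 is that the
write still works as before, but a subsequent open for reading is refused, so
nothing ever "reads back 3" again. A hypothetical userspace demonstration of
that behavior change (the write requires root; the exact errno on the read is
an assumption based on the new 0200 mode bits):

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *path = "/proc/sys/vm/drop_caches";
		int fd;

		/* The one-time command: still permitted for root. */
		fd = open(path, O_WRONLY);
		if (fd >= 0) {
			if (write(fd, "3\n", 2) != 2)
				perror("write");
			close(fd);
		}

		/* Reading is no longer meaningful and now fails outright. */
		fd = open(path, O_RDONLY);
		if (fd < 0)
			printf("read back: %s\n", strerror(errno));
		else
			close(fd);
		return 0;
	}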