summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/auditsc.c4
-rw-r--r--kernel/bpf/helpers.c4
-rw-r--r--kernel/bpf/verifier.c10
-rw-r--r--kernel/cgroup/cgroup.c2
-rw-r--r--kernel/cgroup/namespace.c2
-rw-r--r--kernel/cpu.c6
-rw-r--r--kernel/crash_core.c2
-rw-r--r--kernel/elfcore.c26
-rw-r--r--kernel/entry/Makefile2
-rw-r--r--kernel/entry/common.c188
-rw-r--r--kernel/entry/common.h7
-rw-r--r--kernel/entry/kvm.c3
-rw-r--r--kernel/entry/syscall_user_dispatch.c104
-rw-r--r--kernel/events/core.c160
-rw-r--r--kernel/events/uprobes.c2
-rw-r--r--kernel/fork.c13
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/irq/irqdomain.c13
-rw-r--r--kernel/kcsan/encoding.h20
-rw-r--r--kernel/kcsan/selftest.c3
-rw-r--r--kernel/kexec_core.c1
-rw-r--r--kernel/kexec_file.c2
-rw-r--r--kernel/kprobes.c314
-rw-r--r--kernel/locking/lock_events_list.h6
-rw-r--r--kernel/locking/locktorture.c36
-rw-r--r--kernel/locking/rwsem.c383
-rw-r--r--kernel/nsproxy.c13
-rw-r--r--kernel/pid.c2
-rw-r--r--kernel/pid_namespace.c13
-rw-r--r--kernel/printk/printk.c4
-rw-r--r--kernel/printk/printk_ringbuffer.c2
-rw-r--r--kernel/ptrace.c16
-rw-r--r--kernel/rcu/Kconfig20
-rw-r--r--kernel/rcu/rcu.h16
-rw-r--r--kernel/rcu/rcu_segcblist.h2
-rw-r--r--kernel/rcu/rcuscale.c37
-rw-r--r--kernel/rcu/rcutorture.c52
-rw-r--r--kernel/rcu/refscale.c11
-rw-r--r--kernel/rcu/srcutree.c6
-rw-r--r--kernel/rcu/tasks.h49
-rw-r--r--kernel/rcu/tree.c200
-rw-r--r--kernel/rcu/tree.h2
-rw-r--r--kernel/rcu/tree_plugin.h2
-rw-r--r--kernel/rcu/tree_stall.h6
-rw-r--r--kernel/scftorture.c49
-rw-r--r--kernel/sched/core.c3
-rw-r--r--kernel/sched/idle.c28
-rw-r--r--kernel/sched/membarrier.c77
-rw-r--r--kernel/scs.c71
-rw-r--r--kernel/seccomp.c6
-rw-r--r--kernel/signal.c62
-rw-r--r--kernel/sys.c5
-rw-r--r--kernel/sysctl.c11
-rw-r--r--kernel/task_work.c41
-rw-r--r--kernel/time/hrtimer.c2
-rw-r--r--kernel/time/jiffies.c3
-rw-r--r--kernel/time/namespace.c15
-rw-r--r--kernel/time/ntp.c229
-rw-r--r--kernel/time/ntp_internal.h7
-rw-r--r--kernel/time/tick-broadcast.c25
-rw-r--r--kernel/time/tick-common.c12
-rw-r--r--kernel/time/tick-internal.h1
-rw-r--r--kernel/time/tick-sched.c122
-rw-r--r--kernel/time/timeconv.c6
-rw-r--r--kernel/time/timekeeping.c85
-rw-r--r--kernel/time/timekeeping.h2
-rw-r--r--kernel/time/timer.c57
-rw-r--r--kernel/time/timer_list.c66
-rw-r--r--kernel/torture.c34
-rw-r--r--kernel/trace/Kconfig2
-rw-r--r--kernel/trace/bpf_trace.c4
-rw-r--r--kernel/trace/ftrace.c22
-rw-r--r--kernel/trace/ring_buffer.c20
-rw-r--r--kernel/trace/trace.c15
-rw-r--r--kernel/trace/trace_events.c8
-rw-r--r--kernel/trace/trace_hwlat.c2
-rw-r--r--kernel/trace/trace_kprobe.c3
-rw-r--r--kernel/tracepoint.c4
-rw-r--r--kernel/user.c2
-rw-r--r--kernel/user_namespace.c4
-rw-r--r--kernel/utsname.c7
82 files changed, 1732 insertions, 1149 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index af601b9bda0e..6c9f19911be0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -97,7 +97,6 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
-obj-$(CONFIG_ELFCORE) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_TRACE_CLOCK) += trace/
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 8dba8f0983b5..c00aa5837965 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -952,7 +952,7 @@ int audit_alloc(struct task_struct *tsk)
state = audit_filter_task(tsk, &key);
if (state == AUDIT_DISABLED) {
- clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+ clear_task_syscall_work(tsk, SYSCALL_AUDIT);
return 0;
}
@@ -964,7 +964,7 @@ int audit_alloc(struct task_struct *tsk)
context->filterkey = key;
audit_set_context(tsk, context);
- set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+ set_task_syscall_work(tsk, SYSCALL_AUDIT);
return 0;
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 25520f5eeaf6..deda1185237b 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -717,9 +717,9 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_snprintf_btf_proto;
case BPF_FUNC_jiffies64:
return &bpf_jiffies64_proto;
- case BPF_FUNC_bpf_per_cpu_ptr:
+ case BPF_FUNC_per_cpu_ptr:
return &bpf_per_cpu_ptr_proto;
- case BPF_FUNC_bpf_this_cpu_ptr:
+ case BPF_FUNC_this_cpu_ptr:
return &bpf_this_cpu_ptr_proto;
default:
break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1388bf733071..53fe6ef6d931 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1298,9 +1298,7 @@ static void __reg_combine_32_into_64(struct bpf_reg_state *reg)
static bool __reg64_bound_s32(s64 a)
{
- if (a > S32_MIN && a < S32_MAX)
- return true;
- return false;
+ return a > S32_MIN && a < S32_MAX;
}
static bool __reg64_bound_u32(u64 a)
@@ -1314,10 +1312,10 @@ static void __reg_combine_64_into_32(struct bpf_reg_state *reg)
{
__mark_reg32_unbounded(reg);
- if (__reg64_bound_s32(reg->smin_value))
+ if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) {
reg->s32_min_value = (s32)reg->smin_value;
- if (__reg64_bound_s32(reg->smax_value))
reg->s32_max_value = (s32)reg->smax_value;
+ }
if (__reg64_bound_u32(reg->umin_value))
reg->u32_min_value = (u32)reg->umin_value;
if (__reg64_bound_u32(reg->umax_value))
@@ -4895,6 +4893,8 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
ret_reg->smax_value = meta->msize_max_value;
ret_reg->s32_max_value = meta->msize_max_value;
+ ret_reg->smin_value = -MAX_ERRNO;
+ ret_reg->s32_min_value = -MAX_ERRNO;
__reg_deduce_bounds(ret_reg);
__reg_bound_offset(ret_reg);
__update_reg_bounds(ret_reg);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e41c21819ba0..f2eeff74d713 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -199,7 +199,7 @@ static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .count = REFCOUNT_INIT(2),
+ .ns.count = REFCOUNT_INIT(2),
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
.ns.inum = PROC_CGROUP_INIT_INO,
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index 812a61afd538..f5e8828c109c 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -32,7 +32,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
kfree(new_ns);
return ERR_PTR(ret);
}
- refcount_set(&new_ns->count, 1);
+ refcount_set(&new_ns->ns.count, 1);
new_ns->ns.ops = &cgroupns_operations;
return new_ns;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index fa535eaa4826..4e11e91010e1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -815,6 +815,10 @@ void __init cpuhp_threads_init(void)
}
#ifdef CONFIG_HOTPLUG_CPU
+#ifndef arch_clear_mm_cpumask_cpu
+#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
+#endif
+
/**
* clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
* @cpu: a CPU id
@@ -850,7 +854,7 @@ void clear_tasks_mm_cpumask(int cpu)
t = find_lock_task_mm(p);
if (!t)
continue;
- cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
+ arch_clear_mm_cpumask_cpu(cpu, t->mm);
task_unlock(t);
}
rcu_read_unlock();
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 106e4500fd53..4fcfe0b70c4e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -11,7 +11,7 @@
#include <asm/page.h>
#include <asm/sections.h>
-#include <crypto/sha.h>
+#include <crypto/sha1.h>
/* vmcoreinfo stuff */
unsigned char *vmcoreinfo_data;
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
deleted file mode 100644
index 57fb4dcff434..000000000000
--- a/kernel/elfcore.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/elf.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/binfmts.h>
-#include <linux/elfcore.h>
-
-Elf_Half __weak elf_core_extra_phdrs(void)
-{
- return 0;
-}
-
-int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
-{
- return 1;
-}
-
-int __weak elf_core_write_extra_data(struct coredump_params *cprm)
-{
- return 1;
-}
-
-size_t __weak elf_core_extra_data_size(void)
-{
- return 0;
-}
diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile
index 34c8a3f1c735..095c775e001e 100644
--- a/kernel/entry/Makefile
+++ b/kernel/entry/Makefile
@@ -9,5 +9,5 @@ KCOV_INSTRUMENT := n
CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong
CFLAGS_common.o += -fno-stack-protector
-obj-$(CONFIG_GENERIC_ENTRY) += common.o
+obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o
obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index e9e2df3f3f9e..d6b73937dab3 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -5,20 +5,13 @@
#include <linux/livepatch.h>
#include <linux/audit.h>
+#include "common.h"
+
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
-/**
- * enter_from_user_mode - Establish state when coming from user mode
- *
- * Syscall/interrupt entry disables interrupts, but user mode is traced as
- * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
- *
- * 1) Tell lockdep that interrupts are disabled
- * 2) Invoke context tracking if enabled to reactivate RCU
- * 3) Trace interrupts off state
- */
-static __always_inline void enter_from_user_mode(struct pt_regs *regs)
+/* See comment for enter_from_user_mode() in entry-common.h */
+static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
arch_check_user_regs(regs);
lockdep_hardirqs_off(CALLER_ADDR0);
@@ -31,6 +24,11 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
instrumentation_end();
}
+void noinstr enter_from_user_mode(struct pt_regs *regs)
+{
+ __enter_from_user_mode(regs);
+}
+
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
if (unlikely(audit_context())) {
@@ -42,19 +40,29 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
}
static long syscall_trace_enter(struct pt_regs *regs, long syscall,
- unsigned long ti_work)
+ unsigned long work)
{
long ret = 0;
+ /*
+ * Handle Syscall User Dispatch. This must comes first, since
+ * the ABI here can be something that doesn't make sense for
+ * other syscall_work features.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (syscall_user_dispatch(regs))
+ return -1L;
+ }
+
/* Handle ptrace */
- if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
+ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
ret = arch_syscall_enter_tracehook(regs);
- if (ret || (ti_work & _TIF_SYSCALL_EMU))
+ if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
return -1L;
}
/* Do seccomp after ptrace, to catch any tracer changes. */
- if (ti_work & _TIF_SECCOMP) {
+ if (work & SYSCALL_WORK_SECCOMP) {
ret = __secure_computing(NULL);
if (ret == -1L)
return ret;
@@ -63,7 +71,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
/* Either of the above might have changed the syscall number */
syscall = syscall_get_nr(current, regs);
- if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT))
+ if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
trace_sys_enter(regs, syscall);
syscall_enter_audit(regs, syscall);
@@ -74,11 +82,10 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
- unsigned long ti_work;
+ unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
- ti_work = READ_ONCE(current_thread_info()->flags);
- if (ti_work & SYSCALL_ENTER_WORK)
- syscall = syscall_trace_enter(regs, syscall, ti_work);
+ if (work & SYSCALL_WORK_ENTER)
+ syscall = syscall_trace_enter(regs, syscall, work);
return syscall;
}
@@ -92,7 +99,7 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
long ret;
- enter_from_user_mode(regs);
+ __enter_from_user_mode(regs);
instrumentation_begin();
local_irq_enable();
@@ -104,25 +111,14 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
- enter_from_user_mode(regs);
+ __enter_from_user_mode(regs);
instrumentation_begin();
local_irq_enable();
instrumentation_end();
}
-/**
- * exit_to_user_mode - Fixup state when exiting to user mode
- *
- * Syscall/interupt exit enables interrupts, but the kernel state is
- * interrupts disabled when this is invoked. Also tell RCU about it.
- *
- * 1) Trace interrupts on state
- * 2) Invoke context tracking if enabled to adjust RCU state
- * 3) Invoke architecture specific last minute exit code, e.g. speculation
- * mitigations, etc.
- * 4) Tell lockdep that interrupts are enabled
- */
-static __always_inline void exit_to_user_mode(void)
+/* See comment for exit_to_user_mode() in entry-common.h */
+static __always_inline void __exit_to_user_mode(void)
{
instrumentation_begin();
trace_hardirqs_on_prepare();
@@ -134,8 +130,21 @@ static __always_inline void exit_to_user_mode(void)
lockdep_hardirqs_on(CALLER_ADDR0);
}
+void noinstr exit_to_user_mode(void)
+{
+ __exit_to_user_mode();
+}
+
/* Workaround to allow gradual conversion of architecture code */
-void __weak arch_do_signal(struct pt_regs *regs) { }
+void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
+
+static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
+{
+ if (ti_work & _TIF_NOTIFY_SIGNAL)
+ tracehook_notify_signal();
+
+ arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
+}
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
@@ -157,8 +166,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
if (ti_work & _TIF_PATCH_PENDING)
klp_update_patch_state(current);
- if (ti_work & _TIF_SIGPENDING)
- arch_do_signal(regs);
+ if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
+ handle_signal_work(regs, ti_work);
if (ti_work & _TIF_NOTIFY_RESUME) {
tracehook_notify_resume(regs);
@@ -199,35 +208,50 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
}
#ifndef _TIF_SINGLESTEP
-static inline bool report_single_step(unsigned long ti_work)
+static inline bool report_single_step(unsigned long work)
{
return false;
}
#else
/*
- * If TIF_SYSCALL_EMU is set, then the only reason to report is when
+ * If SYSCALL_EMU is set, then the only reason to report is when
* TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
* instruction has been already reported in syscall_enter_from_user_mode().
*/
-#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)
-
-static inline bool report_single_step(unsigned long ti_work)
+static inline bool report_single_step(unsigned long work)
{
- return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP;
+ if (!(work & SYSCALL_WORK_SYSCALL_EMU))
+ return false;
+
+ return !!(current_thread_info()->flags & _TIF_SINGLESTEP);
}
#endif
-static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work)
+
+static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
bool step;
+ /*
+ * If the syscall was rolled back due to syscall user dispatching,
+ * then the tracers below are not invoked for the same reason as
+ * the entry side was not invoked in syscall_trace_enter(): The ABI
+ * of these syscalls is unknown.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (unlikely(current->syscall_dispatch.on_dispatch)) {
+ current->syscall_dispatch.on_dispatch = false;
+ return;
+ }
+ }
+
audit_syscall_exit(regs);
- if (ti_work & _TIF_SYSCALL_TRACEPOINT)
+ if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
trace_sys_exit(regs, syscall_get_return_value(current, regs));
- step = report_single_step(ti_work);
- if (step || ti_work & _TIF_SYSCALL_TRACE)
+ step = report_single_step(work);
+ if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
arch_syscall_exit_tracehook(regs, step);
}
@@ -237,7 +261,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work)
*/
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
- u32 cached_flags = READ_ONCE(current_thread_info()->flags);
+ unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
unsigned long nr = syscall_get_nr(current, regs);
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
@@ -254,23 +278,33 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
* enabled, we want to run them exactly once per syscall exit with
* interrupts enabled.
*/
- if (unlikely(cached_flags & SYSCALL_EXIT_WORK))
- syscall_exit_work(regs, cached_flags);
+ if (unlikely(work & SYSCALL_WORK_EXIT))
+ syscall_exit_work(regs, work);
}
-__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
+static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
- instrumentation_begin();
syscall_exit_to_user_mode_prepare(regs);
local_irq_disable_exit_to_user();
exit_to_user_mode_prepare(regs);
+}
+
+void syscall_exit_to_user_mode_work(struct pt_regs *regs)
+{
+ __syscall_exit_to_user_mode_work(regs);
+}
+
+__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ __syscall_exit_to_user_mode_work(regs);
instrumentation_end();
- exit_to_user_mode();
+ __exit_to_user_mode();
}
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
- enter_from_user_mode(regs);
+ __enter_from_user_mode(regs);
}
noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
@@ -278,7 +312,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
instrumentation_begin();
exit_to_user_mode_prepare(regs);
instrumentation_end();
- exit_to_user_mode();
+ __exit_to_user_mode();
}
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
@@ -296,7 +330,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* If this entry hit the idle task invoke rcu_irq_enter() whether
* RCU is watching or not.
*
- * Interupts can nest when the first interrupt invokes softirq
+ * Interrupts can nest when the first interrupt invokes softirq
* processing on return which enables interrupts.
*
* Scheduler ticks in the idle task can mark quiescent state and
@@ -307,7 +341,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* interrupt to invoke rcu_irq_enter(). If that nested interrupt is
* the tick then rcu_flavor_sched_clock_irq() would wrongfully
* assume that it is the first interupt and eventually claim
- * quiescient state and end grace periods prematurely.
+ * quiescent state and end grace periods prematurely.
*
* Unconditionally invoke rcu_irq_enter() so RCU state stays
* consistent.
@@ -319,7 +353,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
/*
* If RCU is not watching then the same careful
* sequence vs. lockdep and tracing is required
- * as in irq_enter_from_user_mode().
+ * as in irqentry_enter_from_user_mode().
*/
lockdep_hardirqs_off(CALLER_ADDR0);
rcu_irq_enter();
@@ -397,3 +431,39 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
rcu_irq_exit();
}
}
+
+irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
+{
+ irqentry_state_t irq_state;
+
+ irq_state.lockdep = lockdep_hardirqs_enabled();
+
+ __nmi_enter();
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ lockdep_hardirq_enter();
+ rcu_nmi_enter();
+
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ ftrace_nmi_enter();
+ instrumentation_end();
+
+ return irq_state;
+}
+
+void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
+{
+ instrumentation_begin();
+ ftrace_nmi_exit();
+ if (irq_state.lockdep) {
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ }
+ instrumentation_end();
+
+ rcu_nmi_exit();
+ lockdep_hardirq_exit();
+ if (irq_state.lockdep)
+ lockdep_hardirqs_on(CALLER_ADDR0);
+ __nmi_exit();
+}
diff --git a/kernel/entry/common.h b/kernel/entry/common.h
new file mode 100644
index 000000000000..f6e6d02f07fe
--- /dev/null
+++ b/kernel/entry/common.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _COMMON_H
+#define _COMMON_H
+
+bool syscall_user_dispatch(struct pt_regs *regs);
+
+#endif
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index b6678a5e3cf6..49972ee99aff 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -8,6 +8,9 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
do {
int ret;
+ if (ti_work & _TIF_NOTIFY_SIGNAL)
+ tracehook_notify_signal();
+
if (ti_work & _TIF_SIGPENDING) {
kvm_handle_signal_exit(vcpu);
return -EINTR;
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
new file mode 100644
index 000000000000..b0338a5625d9
--- /dev/null
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 Collabora Ltd.
+ */
+#include <linux/sched.h>
+#include <linux/prctl.h>
+#include <linux/syscall_user_dispatch.h>
+#include <linux/uaccess.h>
+#include <linux/signal.h>
+#include <linux/elf.h>
+
+#include <linux/sched/signal.h>
+#include <linux/sched/task_stack.h>
+
+#include <asm/syscall.h>
+
+#include "common.h"
+
+static void trigger_sigsys(struct pt_regs *regs)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = SIGSYS;
+ info.si_code = SYS_USER_DISPATCH;
+ info.si_call_addr = (void __user *)KSTK_EIP(current);
+ info.si_errno = 0;
+ info.si_arch = syscall_get_arch(current);
+ info.si_syscall = syscall_get_nr(current, regs);
+
+ force_sig_info(&info);
+}
+
+bool syscall_user_dispatch(struct pt_regs *regs)
+{
+ struct syscall_user_dispatch *sd = &current->syscall_dispatch;
+ char state;
+
+ if (likely(instruction_pointer(regs) - sd->offset < sd->len))
+ return false;
+
+ if (unlikely(arch_syscall_is_vdso_sigreturn(regs)))
+ return false;
+
+ if (likely(sd->selector)) {
+ /*
+ * access_ok() is performed once, at prctl time, when
+ * the selector is loaded by userspace.
+ */
+ if (unlikely(__get_user(state, sd->selector)))
+ do_exit(SIGSEGV);
+
+ if (likely(state == PR_SYS_DISPATCH_OFF))
+ return false;
+
+ if (state != PR_SYS_DISPATCH_ON)
+ do_exit(SIGSYS);
+ }
+
+ sd->on_dispatch = true;
+ syscall_rollback(current, regs);
+ trigger_sigsys(regs);
+
+ return true;
+}
+
+int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
+ unsigned long len, char __user *selector)
+{
+ switch (mode) {
+ case PR_SYS_DISPATCH_OFF:
+ if (offset || len || selector)
+ return -EINVAL;
+ break;
+ case PR_SYS_DISPATCH_ON:
+ /*
+ * Validate the direct dispatcher region just for basic
+ * sanity against overflow and a 0-sized dispatcher
+ * region. If the user is able to submit a syscall from
+ * an address, that address is obviously valid.
+ */
+ if (offset && offset + len <= offset)
+ return -EINVAL;
+
+ if (selector && !access_ok(selector, sizeof(*selector)))
+ return -EFAULT;
+
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ current->syscall_dispatch.selector = selector;
+ current->syscall_dispatch.offset = offset;
+ current->syscall_dispatch.len = len;
+ current->syscall_dispatch.on_dispatch = false;
+
+ if (mode == PR_SYS_DISPATCH_ON)
+ set_syscall_work(SYSCALL_USER_DISPATCH);
+ else
+ clear_syscall_work(SYSCALL_USER_DISPATCH);
+
+ return 0;
+}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dc568ca295bd..19ae6c931c52 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -51,6 +51,8 @@
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
+#include <linux/highmem.h>
+#include <linux/pgtable.h>
#include "internal.h"
@@ -1895,6 +1897,12 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_CGROUP)
size += sizeof(data->cgroup);
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ size += sizeof(data->data_page_size);
+
+ if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ size += sizeof(data->code_page_size);
+
event->header_size = size;
}
@@ -6931,6 +6939,12 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_CGROUP)
perf_output_put(handle, data->cgroup);
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ perf_output_put(handle, data->data_page_size);
+
+ if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ perf_output_put(handle, data->code_page_size);
+
if (sample_type & PERF_SAMPLE_AUX) {
perf_output_put(handle, data->aux_size);
@@ -6988,6 +7002,93 @@ static u64 perf_virt_to_phys(u64 virt)
return phys_addr;
}
+/*
+ * Return the pagetable size of a given virtual address.
+ */
+static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
+{
+ u64 size = 0;
+
+#ifdef CONFIG_HAVE_FAST_GUP
+ pgd_t *pgdp, pgd;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+
+ pgdp = pgd_offset(mm, addr);
+ pgd = READ_ONCE(*pgdp);
+ if (pgd_none(pgd))
+ return 0;
+
+ if (pgd_leaf(pgd))
+ return pgd_leaf_size(pgd);
+
+ p4dp = p4d_offset_lockless(pgdp, pgd, addr);
+ p4d = READ_ONCE(*p4dp);
+ if (!p4d_present(p4d))
+ return 0;
+
+ if (p4d_leaf(p4d))
+ return p4d_leaf_size(p4d);
+
+ pudp = pud_offset_lockless(p4dp, p4d, addr);
+ pud = READ_ONCE(*pudp);
+ if (!pud_present(pud))
+ return 0;
+
+ if (pud_leaf(pud))
+ return pud_leaf_size(pud);
+
+ pmdp = pmd_offset_lockless(pudp, pud, addr);
+ pmd = READ_ONCE(*pmdp);
+ if (!pmd_present(pmd))
+ return 0;
+
+ if (pmd_leaf(pmd))
+ return pmd_leaf_size(pmd);
+
+ ptep = pte_offset_map(&pmd, addr);
+ pte = ptep_get_lockless(ptep);
+ if (pte_present(pte))
+ size = pte_leaf_size(pte);
+ pte_unmap(ptep);
+#endif /* CONFIG_HAVE_FAST_GUP */
+
+ return size;
+}
+
+static u64 perf_get_page_size(unsigned long addr)
+{
+ struct mm_struct *mm;
+ unsigned long flags;
+ u64 size;
+
+ if (!addr)
+ return 0;
+
+ /*
+ * Software page-table walkers must disable IRQs,
+ * which prevents any tear down of the page tables.
+ */
+ local_irq_save(flags);
+
+ mm = current->mm;
+ if (!mm) {
+ /*
+ * For kernel threads and the like, use init_mm so that
+ * we can find kernel memory.
+ */
+ mm = &init_mm;
+ }
+
+ size = perf_get_pgtable_size(mm, addr);
+
+ local_irq_restore(flags);
+
+ return size;
+}
+
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
struct perf_callchain_entry *
@@ -7023,7 +7124,7 @@ void perf_prepare_sample(struct perf_event_header *header,
__perf_event_header__init_id(header, data, event);
- if (sample_type & PERF_SAMPLE_IP)
+ if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
data->ip = perf_instruction_pointer(regs);
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
@@ -7142,6 +7243,17 @@ void perf_prepare_sample(struct perf_event_header *header,
}
#endif
+ /*
+ * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't
+ * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr,
+ * but the value will not dump to the userspace.
+ */
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ data->data_page_size = perf_get_page_size(data->addr);
+
+ if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
+ data->code_page_size = perf_get_page_size(data->ip);
+
if (sample_type & PERF_SAMPLE_AUX) {
u64 size;
@@ -11720,24 +11832,6 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_task;
}
- if (task) {
- err = mutex_lock_interruptible(&task->signal->exec_update_mutex);
- if (err)
- goto err_task;
-
- /*
- * Preserve ptrace permission check for backwards compatibility.
- *
- * We must hold exec_update_mutex across this and any potential
- * perf_install_in_context() call for this new event to
- * serialize against exec() altering our credentials (and the
- * perf_event_exit_task() that could imply).
- */
- err = -EACCES;
- if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
- goto err_cred;
- }
-
if (flags & PERF_FLAG_PID_CGROUP)
cgroup_fd = pid;
@@ -11745,7 +11839,7 @@ SYSCALL_DEFINE5(perf_event_open,
NULL, NULL, cgroup_fd);
if (IS_ERR(event)) {
err = PTR_ERR(event);
- goto err_cred;
+ goto err_task;
}
if (is_sampling_event(event)) {
@@ -11864,6 +11958,24 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
}
+ if (task) {
+ err = mutex_lock_interruptible(&task->signal->exec_update_mutex);
+ if (err)
+ goto err_file;
+
+ /*
+ * Preserve ptrace permission check for backwards compatibility.
+ *
+ * We must hold exec_update_mutex across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+ goto err_cred;
+ }
+
if (move_group) {
gctx = __perf_event_ctx_lock_double(group_leader, ctx);
@@ -12039,7 +12151,10 @@ err_locked:
if (move_group)
perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
-/* err_file: */
+err_cred:
+ if (task)
+ mutex_unlock(&task->signal->exec_update_mutex);
+err_file:
fput(event_file);
err_context:
perf_unpin_context(ctx);
@@ -12051,9 +12166,6 @@ err_alloc:
*/
if (!event_file)
free_event(event);
-err_cred:
- if (task)
- mutex_unlock(&task->signal->exec_update_mutex);
err_task:
if (task)
put_task_struct(task);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 00b0358739ab..bf9edd8d75be 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1973,7 +1973,7 @@ bool uprobe_deny_signal(void)
WARN_ON_ONCE(utask->state != UTASK_SSTEP);
- if (signal_pending(t)) {
+ if (task_sigpending(t)) {
spin_lock_irq(&t->sighand->siglock);
clear_tsk_thread_flag(t, TIF_SIGPENDING);
spin_unlock_irq(&t->sighand->siglock);
diff --git a/kernel/fork.c b/kernel/fork.c
index 6d266388d380..1513832de220 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -906,6 +906,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
set_task_stack_end_magic(tsk);
+ clear_syscall_work_syscall_user_dispatch(tsk);
#ifdef CONFIG_STACKPROTECTOR
tsk->stack_canary = get_random_canary();
@@ -1625,7 +1626,7 @@ static void copy_seccomp(struct task_struct *p)
* to manually enable the seccomp thread flag here.
*/
if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
- set_tsk_thread_flag(p, TIF_SECCOMP);
+ set_task_syscall_work(p, SECCOMP);
#endif
}
@@ -2158,9 +2159,9 @@ static __latent_entropy struct task_struct *copy_process(
* child regardless of CLONE_PTRACE.
*/
user_disable_single_step(p);
- clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
-#ifdef TIF_SYSCALL_EMU
- clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
+ clear_task_syscall_work(p, SYSCALL_TRACE);
+#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
+ clear_task_syscall_work(p, SYSCALL_EMU);
#endif
clear_tsk_latency_tracing(p);
@@ -2182,6 +2183,10 @@ static __latent_entropy struct task_struct *copy_process(
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
+#ifdef CONFIG_KRETPROBES
+ p->kretprobe_instances.first = NULL;
+#endif
+
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted that the new process's css_set can be changed
diff --git a/kernel/futex.c b/kernel/futex.c
index 00259c7e288e..c47d1015d759 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -310,8 +310,6 @@ static inline bool should_fail_futex(bool fshared)
#ifdef CONFIG_COMPAT
static void compat_exit_robust_list(struct task_struct *curr);
-#else
-static inline void compat_exit_robust_list(struct task_struct *curr) { }
#endif
/*
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index cf8b374b892d..e4ca69608f3b 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -624,17 +624,19 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain)
EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
/**
- * irq_create_mapping() - Map a hardware interrupt into linux irq space
+ * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space
* @domain: domain owning this hardware interrupt or NULL for default domain
* @hwirq: hardware irq number in that domain space
+ * @affinity: irq affinity
*
* Only one mapping per hardware interrupt is permitted. Returns a linux
* irq number.
* If the sense/trigger is to be specified, set_irq_type() should be called
* on the number returned from that call.
*/
-unsigned int irq_create_mapping(struct irq_domain *domain,
- irq_hw_number_t hwirq)
+unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
+ irq_hw_number_t hwirq,
+ const struct irq_affinity_desc *affinity)
{
struct device_node *of_node;
int virq;
@@ -660,7 +662,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
}
/* Allocate a virtual interrupt number */
- virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL);
+ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node),
+ affinity);
if (virq <= 0) {
pr_debug("-> virq allocation failed\n");
return 0;
@@ -676,7 +679,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain,
return virq;
}
-EXPORT_SYMBOL_GPL(irq_create_mapping);
+EXPORT_SYMBOL_GPL(irq_create_mapping_affinity);
/**
* irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h
index 1a6db2f797ac..7ee405524904 100644
--- a/kernel/kcsan/encoding.h
+++ b/kernel/kcsan/encoding.h
@@ -37,18 +37,20 @@
*/
#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS)
-/*
- * Masks to set/retrieve the encoded data.
- */
-#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1)
-#define WATCHPOINT_SIZE_MASK \
- GENMASK(BITS_PER_LONG-2, BITS_PER_LONG-2 - WATCHPOINT_SIZE_BITS)
-#define WATCHPOINT_ADDR_MASK \
- GENMASK(BITS_PER_LONG-3 - WATCHPOINT_SIZE_BITS, 0)
+/* Bitmasks for the encoded watchpoint access information. */
+#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1)
+#define WATCHPOINT_SIZE_MASK GENMASK(BITS_PER_LONG-2, WATCHPOINT_ADDR_BITS)
+#define WATCHPOINT_ADDR_MASK GENMASK(WATCHPOINT_ADDR_BITS-1, 0)
+static_assert(WATCHPOINT_ADDR_MASK == (1UL << WATCHPOINT_ADDR_BITS) - 1);
+static_assert((WATCHPOINT_WRITE_MASK ^ WATCHPOINT_SIZE_MASK ^ WATCHPOINT_ADDR_MASK) == ~0UL);
static inline bool check_encodable(unsigned long addr, size_t size)
{
- return size <= MAX_ENCODABLE_SIZE;
+ /*
+ * While we can encode addrs<PAGE_SIZE, avoid crashing with a NULL
+ * pointer deref inside KCSAN.
+ */
+ return addr >= PAGE_SIZE && size <= MAX_ENCODABLE_SIZE;
}
static inline long
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index d98bc208d06d..9014a3a82cf9 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -33,6 +33,9 @@ static bool test_encode_decode(void)
unsigned long addr;
prandom_bytes(&addr, sizeof(addr));
+ if (addr < PAGE_SIZE)
+ addr = PAGE_SIZE;
+
if (WARN_ON(!check_encodable(addr, size)))
return false;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 8798a8183974..4f8efc278aa7 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -42,7 +42,6 @@
#include <asm/sections.h>
#include <crypto/hash.h>
-#include <crypto/sha.h>
#include "kexec_internal.h"
DEFINE_MUTEX(kexec_mutex);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index e21f6b9234f7..b02086d70492 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -20,7 +20,7 @@
#include <linux/fs.h>
#include <linux/ima.h>
#include <crypto/hash.h>
-#include <crypto/sha.h>
+#include <crypto/sha2.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/kernel.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 41fdbb7953c6..f7fb5d135930 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -36,7 +36,6 @@
#include <linux/cpu.h>
#include <linux/jump_label.h>
#include <linux/perf_event.h>
-#include <linux/static_call.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
@@ -54,7 +53,6 @@ static int kprobes_initialized;
* - RCU hlist traversal under disabling preempt (breakpoint handlers)
*/
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
-static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
/* NOTE: change this value only with kprobe_mutex held */
static bool kprobes_all_disarmed;
@@ -62,9 +60,6 @@ static bool kprobes_all_disarmed;
/* This protects kprobe_table and optimizing_list */
static DEFINE_MUTEX(kprobe_mutex);
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
-static struct {
- raw_spinlock_t lock ____cacheline_aligned_in_smp;
-} kretprobe_table_locks[KPROBE_TABLE_SIZE];
kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
unsigned int __unused)
@@ -72,11 +67,6 @@ kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
return ((kprobe_opcode_t *)(kallsyms_lookup_name(name)));
}
-static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
-{
- return &(kretprobe_table_locks[hash].lock);
-}
-
/* Blacklist -- list of struct kprobe_blacklist_entry */
static LIST_HEAD(kprobe_blacklist);
@@ -1224,76 +1214,26 @@ void kprobes_inc_nmissed_count(struct kprobe *p)
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
-static void recycle_rp_inst(struct kretprobe_instance *ri)
+static void free_rp_inst_rcu(struct rcu_head *head)
{
- struct kretprobe *rp = ri->rp;
+ struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
- /* remove rp inst off the rprobe_inst_table */
- hlist_del(&ri->hlist);
- INIT_HLIST_NODE(&ri->hlist);
- if (likely(rp)) {
- raw_spin_lock(&rp->lock);
- hlist_add_head(&ri->hlist, &rp->free_instances);
- raw_spin_unlock(&rp->lock);
- } else
- kfree_rcu(ri, rcu);
+ if (refcount_dec_and_test(&ri->rph->ref))
+ kfree(ri->rph);
+ kfree(ri);
}
-NOKPROBE_SYMBOL(recycle_rp_inst);
-
-static void kretprobe_hash_lock(struct task_struct *tsk,
- struct hlist_head **head, unsigned long *flags)
-__acquires(hlist_lock)
-{
- unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
- raw_spinlock_t *hlist_lock;
+NOKPROBE_SYMBOL(free_rp_inst_rcu);
- *head = &kretprobe_inst_table[hash];
- hlist_lock = kretprobe_table_lock_ptr(hash);
- /*
- * Nested is a workaround that will soon not be needed.
- * There's other protections that make sure the same lock
- * is not taken on the same CPU that lockdep is unaware of.
- * Differentiate when it is taken in NMI context.
- */
- raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi());
-}
-NOKPROBE_SYMBOL(kretprobe_hash_lock);
-
-static void kretprobe_table_lock(unsigned long hash,
- unsigned long *flags)
-__acquires(hlist_lock)
-{
- raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
- /*
- * Nested is a workaround that will soon not be needed.
- * There's other protections that make sure the same lock
- * is not taken on the same CPU that lockdep is unaware of.
- * Differentiate when it is taken in NMI context.
- */
- raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi());
-}
-NOKPROBE_SYMBOL(kretprobe_table_lock);
-
-static void kretprobe_hash_unlock(struct task_struct *tsk,
- unsigned long *flags)
-__releases(hlist_lock)
+static void recycle_rp_inst(struct kretprobe_instance *ri)
{
- unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
- raw_spinlock_t *hlist_lock;
-
- hlist_lock = kretprobe_table_lock_ptr(hash);
- raw_spin_unlock_irqrestore(hlist_lock, *flags);
-}
-NOKPROBE_SYMBOL(kretprobe_hash_unlock);
+ struct kretprobe *rp = get_kretprobe(ri);
-static void kretprobe_table_unlock(unsigned long hash,
- unsigned long *flags)
-__releases(hlist_lock)
-{
- raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
- raw_spin_unlock_irqrestore(hlist_lock, *flags);
+ if (likely(rp)) {
+ freelist_add(&ri->freelist, &rp->freelist);
+ } else
+ call_rcu(&ri->rcu, free_rp_inst_rcu);
}
-NOKPROBE_SYMBOL(kretprobe_table_unlock);
+NOKPROBE_SYMBOL(recycle_rp_inst);
static struct kprobe kprobe_busy = {
.addr = (void *) get_kprobe,
@@ -1324,24 +1264,21 @@ void kprobe_busy_end(void)
void kprobe_flush_task(struct task_struct *tk)
{
struct kretprobe_instance *ri;
- struct hlist_head *head;
- struct hlist_node *tmp;
- unsigned long hash, flags = 0;
+ struct llist_node *node;
+ /* Early boot, not yet initialized. */
if (unlikely(!kprobes_initialized))
- /* Early boot. kretprobe_table_locks not yet initialized. */
return;
kprobe_busy_begin();
- hash = hash_ptr(tk, KPROBE_HASH_BITS);
- head = &kretprobe_inst_table[hash];
- kretprobe_table_lock(hash, &flags);
- hlist_for_each_entry_safe(ri, tmp, head, hlist) {
- if (ri->task == tk)
- recycle_rp_inst(ri);
+ node = __llist_del_all(&tk->kretprobe_instances);
+ while (node) {
+ ri = container_of(node, struct kretprobe_instance, llist);
+ node = node->next;
+
+ recycle_rp_inst(ri);
}
- kretprobe_table_unlock(hash, &flags);
kprobe_busy_end();
}
@@ -1350,37 +1287,23 @@ NOKPROBE_SYMBOL(kprobe_flush_task);
static inline void free_rp_inst(struct kretprobe *rp)
{
struct kretprobe_instance *ri;
- struct hlist_node *next;
+ struct freelist_node *node;
+ int count = 0;
+
+ node = rp->freelist.head;
+ while (node) {
+ ri = container_of(node, struct kretprobe_instance, freelist);
+ node = node->next;
- hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
- hlist_del(&ri->hlist);
kfree(ri);
+ count++;
}
-}
-
-static void cleanup_rp_inst(struct kretprobe *rp)
-{
- unsigned long flags, hash;
- struct kretprobe_instance *ri;
- struct hlist_node *next;
- struct hlist_head *head;
- /* To avoid recursive kretprobe by NMI, set kprobe busy here */
- kprobe_busy_begin();
- for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
- kretprobe_table_lock(hash, &flags);
- head = &kretprobe_inst_table[hash];
- hlist_for_each_entry_safe(ri, next, head, hlist) {
- if (ri->rp == rp)
- ri->rp = NULL;
- }
- kretprobe_table_unlock(hash, &flags);
+ if (refcount_sub_and_test(count, &rp->rph->ref)) {
+ kfree(rp->rph);
+ rp->rph = NULL;
}
- kprobe_busy_end();
-
- free_rp_inst(rp);
}
-NOKPROBE_SYMBOL(cleanup_rp_inst);
/* Add the new probe to ap->list */
static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
@@ -1643,7 +1566,6 @@ static int check_kprobe_address_safe(struct kprobe *p,
if (!kernel_text_address((unsigned long) p->addr) ||
within_kprobe_blacklist((unsigned long) p->addr) ||
jump_label_text_reserved(p->addr, p->addr) ||
- static_call_text_reserved(p->addr, p->addr) ||
find_bug((unsigned long)p->addr)) {
ret = -EINVAL;
goto out;
@@ -1942,88 +1864,56 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs,
void *trampoline_address,
void *frame_pointer)
{
- struct kretprobe_instance *ri = NULL, *last = NULL;
- struct hlist_head *head;
- struct hlist_node *tmp;
- unsigned long flags;
kprobe_opcode_t *correct_ret_addr = NULL;
- bool skipped = false;
+ struct kretprobe_instance *ri = NULL;
+ struct llist_node *first, *node;
+ struct kretprobe *rp;
- kretprobe_hash_lock(current, &head, &flags);
+ /* Find all nodes for this frame. */
+ first = node = current->kretprobe_instances.first;
+ while (node) {
+ ri = container_of(node, struct kretprobe_instance, llist);
- /*
- * It is possible to have multiple instances associated with a given
- * task either because multiple functions in the call path have
- * return probes installed on them, and/or more than one
- * return probe was registered for a target function.
- *
- * We can handle this because:
- * - instances are always pushed into the head of the list
- * - when multiple return probes are registered for the same
- * function, the (chronologically) first instance's ret_addr
- * will be the real return address, and all the rest will
- * point to kretprobe_trampoline.
- */
- hlist_for_each_entry(ri, head, hlist) {
- if (ri->task != current)
- /* another task is sharing our hash bucket */
- continue;
- /*
- * Return probes must be pushed on this hash list correct
- * order (same as return order) so that it can be popped
- * correctly. However, if we find it is pushed it incorrect
- * order, this means we find a function which should not be
- * probed, because the wrong order entry is pushed on the
- * path of processing other kretprobe itself.
- */
- if (ri->fp != frame_pointer) {
- if (!skipped)
- pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n");
- skipped = true;
- continue;
- }
+ BUG_ON(ri->fp != frame_pointer);
- correct_ret_addr = ri->ret_addr;
- if (skipped)
- pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n",
- ri->rp->kp.addr);
-
- if (correct_ret_addr != trampoline_address)
+ if (ri->ret_addr != trampoline_address) {
+ correct_ret_addr = ri->ret_addr;
/*
* This is the real return address. Any other
* instances associated with this task are for
* other calls deeper on the call stack
*/
- break;
+ goto found;
+ }
+
+ node = node->next;
}
+ pr_err("Oops! Kretprobe fails to find correct return address.\n");
+ BUG_ON(1);
- BUG_ON(!correct_ret_addr || (correct_ret_addr == trampoline_address));
- last = ri;
+found:
+ /* Unlink all nodes for this frame. */
+ current->kretprobe_instances.first = node->next;
+ node->next = NULL;
- hlist_for_each_entry_safe(ri, tmp, head, hlist) {
- if (ri->task != current)
- /* another task is sharing our hash bucket */
- continue;
- if (ri->fp != frame_pointer)
- continue;
+ /* Run them.. */
+ while (first) {
+ ri = container_of(first, struct kretprobe_instance, llist);
+ first = first->next;
- if (ri->rp && ri->rp->handler) {
+ rp = get_kretprobe(ri);
+ if (rp && rp->handler) {
struct kprobe *prev = kprobe_running();
- __this_cpu_write(current_kprobe, &ri->rp->kp);
+ __this_cpu_write(current_kprobe, &rp->kp);
ri->ret_addr = correct_ret_addr;
- ri->rp->handler(ri, regs);
+ rp->handler(ri, regs);
__this_cpu_write(current_kprobe, prev);
}
recycle_rp_inst(ri);
-
- if (ri == last)
- break;
}
- kretprobe_hash_unlock(current, &flags);
-
return (unsigned long)correct_ret_addr;
}
NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)
@@ -2035,44 +1925,26 @@ NOKPROBE_SYMBOL(__kretprobe_trampoline_handler)
static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
- unsigned long hash, flags = 0;
struct kretprobe_instance *ri;
+ struct freelist_node *fn;
- /* TODO: consider to only swap the RA after the last pre_handler fired */
- hash = hash_ptr(current, KPROBE_HASH_BITS);
- /*
- * Nested is a workaround that will soon not be needed.
- * There's other protections that make sure the same lock
- * is not taken on the same CPU that lockdep is unaware of.
- */
- raw_spin_lock_irqsave_nested(&rp->lock, flags, 1);
- if (!hlist_empty(&rp->free_instances)) {
- ri = hlist_entry(rp->free_instances.first,
- struct kretprobe_instance, hlist);
- hlist_del(&ri->hlist);
- raw_spin_unlock_irqrestore(&rp->lock, flags);
-
- ri->rp = rp;
- ri->task = current;
-
- if (rp->entry_handler && rp->entry_handler(ri, regs)) {
- raw_spin_lock_irqsave_nested(&rp->lock, flags, 1);
- hlist_add_head(&ri->hlist, &rp->free_instances);
- raw_spin_unlock_irqrestore(&rp->lock, flags);
- return 0;
- }
+ fn = freelist_try_get(&rp->freelist);
+ if (!fn) {
+ rp->nmissed++;
+ return 0;
+ }
- arch_prepare_kretprobe(ri, regs);
+ ri = container_of(fn, struct kretprobe_instance, freelist);
- /* XXX(hch): why is there no hlist_move_head? */
- INIT_HLIST_NODE(&ri->hlist);
- kretprobe_table_lock(hash, &flags);
- hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
- kretprobe_table_unlock(hash, &flags);
- } else {
- rp->nmissed++;
- raw_spin_unlock_irqrestore(&rp->lock, flags);
+ if (rp->entry_handler && rp->entry_handler(ri, regs)) {
+ freelist_add(&ri->freelist, &rp->freelist);
+ return 0;
}
+
+ arch_prepare_kretprobe(ri, regs);
+
+ __llist_add(&ri->llist, &current->kretprobe_instances);
+
return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
@@ -2129,18 +2001,24 @@ int register_kretprobe(struct kretprobe *rp)
rp->maxactive = num_possible_cpus();
#endif
}
- raw_spin_lock_init(&rp->lock);
- INIT_HLIST_HEAD(&rp->free_instances);
+ rp->freelist.head = NULL;
+ rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL);
+ if (!rp->rph)
+ return -ENOMEM;
+
+ rp->rph->rp = rp;
for (i = 0; i < rp->maxactive; i++) {
- inst = kmalloc(sizeof(struct kretprobe_instance) +
+ inst = kzalloc(sizeof(struct kretprobe_instance) +
rp->data_size, GFP_KERNEL);
if (inst == NULL) {
+ refcount_set(&rp->rph->ref, i);
free_rp_inst(rp);
return -ENOMEM;
}
- INIT_HLIST_NODE(&inst->hlist);
- hlist_add_head(&inst->hlist, &rp->free_instances);
+ inst->rph = rp->rph;
+ freelist_add(&inst->freelist, &rp->freelist);
}
+ refcount_set(&rp->rph->ref, i);
rp->nmissed = 0;
/* Establish function entry probe point */
@@ -2182,16 +2060,18 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
if (num <= 0)
return;
mutex_lock(&kprobe_mutex);
- for (i = 0; i < num; i++)
+ for (i = 0; i < num; i++) {
if (__unregister_kprobe_top(&rps[i]->kp) < 0)
rps[i]->kp.addr = NULL;
+ rps[i]->rph->rp = NULL;
+ }
mutex_unlock(&kprobe_mutex);
synchronize_rcu();
for (i = 0; i < num; i++) {
if (rps[i]->kp.addr) {
__unregister_kprobe_bottom(&rps[i]->kp);
- cleanup_rp_inst(rps[i]);
+ free_rp_inst(rps[i]);
}
}
}
@@ -2235,9 +2115,6 @@ static void kill_kprobe(struct kprobe *p)
lockdep_assert_held(&kprobe_mutex);
- if (WARN_ON_ONCE(kprobe_gone(p)))
- return;
-
p->flags |= KPROBE_FLAG_GONE;
if (kprobe_aggrprobe(p)) {
/*
@@ -2518,10 +2395,7 @@ static int kprobes_module_callback(struct notifier_block *nb,
mutex_lock(&kprobe_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry(p, head, hlist) {
- if (kprobe_gone(p))
- continue;
-
+ hlist_for_each_entry(p, head, hlist)
if (within_module_init((unsigned long)p->addr, mod) ||
(checkcore &&
within_module_core((unsigned long)p->addr, mod))) {
@@ -2538,7 +2412,6 @@ static int kprobes_module_callback(struct notifier_block *nb,
*/
kill_kprobe(p);
}
- }
}
if (val == MODULE_STATE_GOING)
remove_module_kprobe_blacklist(mod);
@@ -2583,11 +2456,8 @@ static int __init init_kprobes(void)
/* FIXME allocate the probe table, currently defined statically */
/* initialize all list heads */
- for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&kprobe_table[i]);
- INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
- raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
- }
err = populate_kprobe_blacklist(__start_kprobe_blacklist,
__stop_kprobe_blacklist);
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
index 239039d0ce21..97fb6f3f840a 100644
--- a/kernel/locking/lock_events_list.h
+++ b/kernel/locking/lock_events_list.h
@@ -56,13 +56,11 @@ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */
LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */
LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */
-LOCK_EVENT(rwsem_opt_rlock) /* # of opt-acquired read locks */
-LOCK_EVENT(rwsem_opt_wlock) /* # of opt-acquired write locks */
+LOCK_EVENT(rwsem_opt_lock) /* # of opt-acquired write locks */
LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */
LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */
-LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */
-LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */
LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */
+LOCK_EVENT(rwsem_rlock_steal) /* # of read locks by lock stealing */
LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */
LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */
LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 62d215b2e39f..fd838cea3934 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -29,6 +29,7 @@
#include <linux/slab.h>
#include <linux/percpu-rwsem.h>
#include <linux/torture.h>
+#include <linux/reboot.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
@@ -60,6 +61,7 @@ static struct task_struct **reader_tasks;
static bool lock_is_write_held;
static bool lock_is_read_held;
+static unsigned long last_lock_release;
struct lock_stress_stats {
long n_lock_fail;
@@ -74,6 +76,7 @@ static void lock_torture_cleanup(void);
*/
struct lock_torture_ops {
void (*init)(void);
+ void (*exit)(void);
int (*writelock)(void);
void (*write_delay)(struct torture_random_state *trsp);
void (*task_boost)(struct torture_random_state *trsp);
@@ -90,12 +93,13 @@ struct lock_torture_cxt {
int nrealwriters_stress;
int nrealreaders_stress;
bool debug_lock;
+ bool init_called;
atomic_t n_lock_torture_errors;
struct lock_torture_ops *cur_ops;
struct lock_stress_stats *lwsa; /* writer statistics */
struct lock_stress_stats *lrsa; /* reader statistics */
};
-static struct lock_torture_cxt cxt = { 0, 0, false,
+static struct lock_torture_cxt cxt = { 0, 0, false, false,
ATOMIC_INIT(0),
NULL, NULL};
/*
@@ -571,6 +575,11 @@ static void torture_percpu_rwsem_init(void)
BUG_ON(percpu_init_rwsem(&pcpu_rwsem));
}
+static void torture_percpu_rwsem_exit(void)
+{
+ percpu_free_rwsem(&pcpu_rwsem);
+}
+
static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem)
{
percpu_down_write(&pcpu_rwsem);
@@ -595,6 +604,7 @@ static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem)
static struct lock_torture_ops percpu_rwsem_lock_ops = {
.init = torture_percpu_rwsem_init,
+ .exit = torture_percpu_rwsem_exit,
.writelock = torture_percpu_rwsem_down_write,
.write_delay = torture_rwsem_write_delay,
.task_boost = torture_boost_dummy,
@@ -632,6 +642,7 @@ static int lock_torture_writer(void *arg)
lwsp->n_lock_acquired++;
cxt.cur_ops->write_delay(&rand);
lock_is_write_held = false;
+ WRITE_ONCE(last_lock_release, jiffies);
cxt.cur_ops->writeunlock();
stutter_wait("lock_torture_writer");
@@ -786,9 +797,10 @@ static void lock_torture_cleanup(void)
/*
* Indicates early cleanup, meaning that the test has not run,
- * such as when passing bogus args when loading the module. As
- * such, only perform the underlying torture-specific cleanups,
- * and avoid anything related to locktorture.
+ * such as when passing bogus args when loading the module.
+ * However cxt->cur_ops.init() may have been invoked, so beside
+ * perform the underlying torture-specific cleanups, cur_ops.exit()
+ * will be invoked if needed.
*/
if (!cxt.lwsa && !cxt.lrsa)
goto end;
@@ -828,6 +840,11 @@ static void lock_torture_cleanup(void)
cxt.lrsa = NULL;
end:
+ if (cxt.init_called) {
+ if (cxt.cur_ops->exit)
+ cxt.cur_ops->exit();
+ cxt.init_called = false;
+ }
torture_cleanup_end();
}
@@ -868,14 +885,17 @@ static int __init lock_torture_init(void)
goto unwind;
}
- if (nwriters_stress == 0 && nreaders_stress == 0) {
+ if (nwriters_stress == 0 &&
+ (!cxt.cur_ops->readlock || nreaders_stress == 0)) {
pr_alert("lock-torture: must run at least one locking thread\n");
firsterr = -EINVAL;
goto unwind;
}
- if (cxt.cur_ops->init)
+ if (cxt.cur_ops->init) {
cxt.cur_ops->init();
+ cxt.init_called = true;
+ }
if (nwriters_stress >= 0)
cxt.nrealwriters_stress = nwriters_stress;
@@ -1038,6 +1058,10 @@ static int __init lock_torture_init(void)
unwind:
torture_init_end();
lock_torture_cleanup();
+ if (shutdown_secs) {
+ WARN_ON(!IS_MODULE(CONFIG_LOCK_TORTURE_TEST));
+ kernel_power_off();
+ }
return firsterr;
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index f11b9bd3431d..ba67600c7b2c 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -31,19 +31,13 @@
#include "lock_events.h"
/*
- * The least significant 3 bits of the owner value has the following
+ * The least significant 2 bits of the owner value has the following
* meanings when set.
* - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
- * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock.
- * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock.
+ * - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock
*
- * When the rwsem is either owned by an anonymous writer, or it is
- * reader-owned, but a spinning writer has timed out, both nonspinnable
- * bits will be set to disable optimistic spinning by readers and writers.
- * In the later case, the last unlocking reader should then check the
- * writer nonspinnable bit and clear it only to give writers preference
- * to acquire the lock via optimistic spinning, but not readers. Similar
- * action is also done in the reader slowpath.
+ * When the rwsem is reader-owned and a spinning writer has timed out,
+ * the nonspinnable bit will be set to disable optimistic spinning.
* When a writer acquires a rwsem, it puts its task_struct pointer
* into the owner field. It is cleared after an unlock.
@@ -59,46 +53,14 @@
* is involved. Ideally we would like to track all the readers that own
* a rwsem, but the overhead is simply too big.
*
- * Reader optimistic spinning is helpful when the reader critical section
- * is short and there aren't that many readers around. It makes readers
- * relatively more preferred than writers. When a writer times out spinning
- * on a reader-owned lock and set the nospinnable bits, there are two main
- * reasons for that.
- *
- * 1) The reader critical section is long, perhaps the task sleeps after
- * acquiring the read lock.
- * 2) There are just too many readers contending the lock causing it to
- * take a while to service all of them.
- *
- * In the former case, long reader critical section will impede the progress
- * of writers which is usually more important for system performance. In
- * the later case, reader optimistic spinning tends to make the reader
- * groups that contain readers that acquire the lock together smaller
- * leading to more of them. That may hurt performance in some cases. In
- * other words, the setting of nonspinnable bits indicates that reader
- * optimistic spinning may not be helpful for those workloads that cause
- * it.
- *
- * Therefore, any writers that had observed the setting of the writer
- * nonspinnable bit for a given rwsem after they fail to acquire the lock
- * via optimistic spinning will set the reader nonspinnable bit once they
- * acquire the write lock. Similarly, readers that observe the setting
- * of reader nonspinnable bit at slowpath entry will set the reader
- * nonspinnable bits when they acquire the read lock via the wakeup path.
- *
- * Once the reader nonspinnable bit is on, it will only be reset when
- * a writer is able to acquire the rwsem in the fast path or somehow a
- * reader or writer in the slowpath doesn't observe the nonspinable bit.
- *
- * This is to discourage reader optmistic spinning on that particular
- * rwsem and make writers more preferred. This adaptive disabling of reader
- * optimistic spinning will alleviate the negative side effect of this
- * feature.
+ * A fast path reader optimistic lock stealing is supported when the rwsem
+ * is previously owned by a writer and the following conditions are met:
+ * - OSQ is empty
+ * - rwsem is not currently writer owned
+ * - the handoff isn't set.
*/
#define RWSEM_READER_OWNED (1UL << 0)
-#define RWSEM_RD_NONSPINNABLE (1UL << 1)
-#define RWSEM_WR_NONSPINNABLE (1UL << 2)
-#define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE)
+#define RWSEM_NONSPINNABLE (1UL << 1)
#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
#ifdef CONFIG_DEBUG_RWSEMS
@@ -203,7 +165,7 @@ static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
struct task_struct *owner)
{
unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED |
- (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE);
+ (atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE);
atomic_long_set(&sem->owner, val);
}
@@ -270,12 +232,31 @@ static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
owner | RWSEM_NONSPINNABLE));
}
-static inline bool rwsem_read_trylock(struct rw_semaphore *sem)
+static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp)
{
- long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
- if (WARN_ON_ONCE(cnt < 0))
+ *cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
+
+ if (WARN_ON_ONCE(*cntp < 0))
rwsem_set_nonspinnable(sem);
- return !(cnt & RWSEM_READ_FAILED_MASK);
+
+ if (!(*cntp & RWSEM_READ_FAILED_MASK)) {
+ rwsem_set_reader_owned(sem);
+ return true;
+ }
+
+ return false;
+}
+
+static inline bool rwsem_write_trylock(struct rw_semaphore *sem)
+{
+ long tmp = RWSEM_UNLOCKED_VALUE;
+
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) {
+ rwsem_set_owner(sem);
+ return true;
+ }
+
+ return false;
}
/*
@@ -353,7 +334,6 @@ struct rwsem_waiter {
struct task_struct *task;
enum rwsem_waiter_type type;
unsigned long timeout;
- unsigned long last_rowner;
};
#define rwsem_first_waiter(sem) \
list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
@@ -467,10 +447,6 @@ static void rwsem_mark_wake(struct rw_semaphore *sem,
* the reader is copied over.
*/
owner = waiter->task;
- if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) {
- owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE);
- lockevent_inc(rwsem_opt_norspin);
- }
__rwsem_set_reader_owned(sem, owner);
}
@@ -602,30 +578,6 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
/*
- * Try to acquire read lock before the reader is put on wait queue.
- * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff
- * is ongoing.
- */
-static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem)
-{
- long count = atomic_long_read(&sem->count);
-
- if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))
- return false;
-
- count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count);
- if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
- rwsem_set_reader_owned(sem);
- lockevent_inc(rwsem_opt_rlock);
- return true;
- }
-
- /* Back out the change */
- atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
- return false;
-}
-
-/*
* Try to acquire write lock before the writer has been put on wait queue.
*/
static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
@@ -636,7 +588,7 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
count | RWSEM_WRITER_LOCKED)) {
rwsem_set_owner(sem);
- lockevent_inc(rwsem_opt_wlock);
+ lockevent_inc(rwsem_opt_lock);
return true;
}
}
@@ -652,8 +604,7 @@ static inline bool owner_on_cpu(struct task_struct *owner)
return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
}
-static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
- unsigned long nonspinnable)
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner;
unsigned long flags;
@@ -670,7 +621,7 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
/*
* Don't check the read-owner as the entry may be stale.
*/
- if ((flags & nonspinnable) ||
+ if ((flags & RWSEM_NONSPINNABLE) ||
(owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
ret = false;
rcu_read_unlock();
@@ -700,9 +651,9 @@ enum owner_state {
#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)
static inline enum owner_state
-rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable)
+rwsem_owner_state(struct task_struct *owner, unsigned long flags)
{
- if (flags & nonspinnable)
+ if (flags & RWSEM_NONSPINNABLE)
return OWNER_NONSPINNABLE;
if (flags & RWSEM_READER_OWNED)
@@ -712,14 +663,14 @@ rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long
}
static noinline enum owner_state
-rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
+rwsem_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *new, *owner;
unsigned long flags, new_flags;
enum owner_state state;
owner = rwsem_owner_flags(sem, &flags);
- state = rwsem_owner_state(owner, flags, nonspinnable);
+ state = rwsem_owner_state(owner, flags);
if (state != OWNER_WRITER)
return state;
@@ -733,7 +684,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
*/
new = rwsem_owner_flags(sem, &new_flags);
if ((new != owner) || (new_flags != flags)) {
- state = rwsem_owner_state(new, new_flags, nonspinnable);
+ state = rwsem_owner_state(new, new_flags);
break;
}
@@ -782,14 +733,12 @@ static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
return sched_clock() + delta;
}
-static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
{
bool taken = false;
int prev_owner_state = OWNER_NULL;
int loop = 0;
u64 rspin_threshold = 0;
- unsigned long nonspinnable = wlock ? RWSEM_WR_NONSPINNABLE
- : RWSEM_RD_NONSPINNABLE;
preempt_disable();
@@ -806,15 +755,14 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
for (;;) {
enum owner_state owner_state;
- owner_state = rwsem_spin_on_owner(sem, nonspinnable);
+ owner_state = rwsem_spin_on_owner(sem);
if (!(owner_state & OWNER_SPINNABLE))
break;
/*
* Try to acquire the lock
*/
- taken = wlock ? rwsem_try_write_lock_unqueued(sem)
- : rwsem_try_read_lock_unqueued(sem);
+ taken = rwsem_try_write_lock_unqueued(sem);
if (taken)
break;
@@ -822,7 +770,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
/*
* Time-based reader-owned rwsem optimistic spinning
*/
- if (wlock && (owner_state == OWNER_READER)) {
+ if (owner_state == OWNER_READER) {
/*
* Re-initialize rspin_threshold every time when
* the owner state changes from non-reader to reader.
@@ -831,7 +779,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
* the beginning of the 2nd reader phase.
*/
if (prev_owner_state != OWNER_READER) {
- if (rwsem_test_oflags(sem, nonspinnable))
+ if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
break;
rspin_threshold = rwsem_rspin_threshold(sem);
loop = 0;
@@ -907,78 +855,30 @@ done:
}
/*
- * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should
+ * Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should
* only be called when the reader count reaches 0.
- *
- * This give writers better chance to acquire the rwsem first before
- * readers when the rwsem was being held by readers for a relatively long
- * period of time. Race can happen that an optimistic spinner may have
- * just stolen the rwsem and set the owner, but just clearing the
- * RWSEM_WR_NONSPINNABLE bit will do no harm anyway.
*/
-static inline void clear_wr_nonspinnable(struct rw_semaphore *sem)
+static inline void clear_nonspinnable(struct rw_semaphore *sem)
{
- if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE))
- atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner);
+ if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE))
+ atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner);
}
-/*
- * This function is called when the reader fails to acquire the lock via
- * optimistic spinning. In this case we will still attempt to do a trylock
- * when comparing the rwsem state right now with the state when entering
- * the slowpath indicates that the reader is still in a valid reader phase.
- * This happens when the following conditions are true:
- *
- * 1) The lock is currently reader owned, and
- * 2) The lock is previously not reader-owned or the last read owner changes.
- *
- * In the former case, we have transitioned from a writer phase to a
- * reader-phase while spinning. In the latter case, it means the reader
- * phase hasn't ended when we entered the optimistic spinning loop. In
- * both cases, the reader is eligible to acquire the lock. This is the
- * secondary path where a read lock is acquired optimistically.
- *
- * The reader non-spinnable bit wasn't set at time of entry or it will
- * not be here at all.
- */
-static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,
- unsigned long last_rowner)
-{
- unsigned long owner = atomic_long_read(&sem->owner);
-
- if (!(owner & RWSEM_READER_OWNED))
- return false;
-
- if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) &&
- rwsem_try_read_lock_unqueued(sem)) {
- lockevent_inc(rwsem_opt_rlock2);
- lockevent_add(rwsem_opt_fail, -1);
- return true;
- }
- return false;
-}
#else
-static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
- unsigned long nonspinnable)
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
return false;
}
-static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
+static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem)
{
return false;
}
-static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { }
-
-static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,
- unsigned long last_rowner)
-{
- return false;
-}
+static inline void clear_nonspinnable(struct rw_semaphore *sem) { }
static inline int
-rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
+rwsem_spin_on_owner(struct rw_semaphore *sem)
{
return 0;
}
@@ -989,36 +889,35 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
* Wait for the read lock to be granted
*/
static struct rw_semaphore __sched *
-rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
+rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, int state)
{
- long count, adjustment = -RWSEM_READER_BIAS;
+ long adjustment = -RWSEM_READER_BIAS;
+ long rcnt = (count >> RWSEM_READER_SHIFT);
struct rwsem_waiter waiter;
DEFINE_WAKE_Q(wake_q);
bool wake = false;
/*
- * Save the current read-owner of rwsem, if available, and the
- * reader nonspinnable bit.
+ * To prevent a constant stream of readers from starving a sleeping
+ * waiter, don't attempt optimistic lock stealing if the lock is
+ * currently owned by readers.
*/
- waiter.last_rowner = atomic_long_read(&sem->owner);
- if (!(waiter.last_rowner & RWSEM_READER_OWNED))
- waiter.last_rowner &= RWSEM_RD_NONSPINNABLE;
-
- if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE))
+ if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) &&
+ (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED))
goto queue;
/*
- * Undo read bias from down_read() and do optimistic spinning.
+ * Reader optimistic lock stealing.
*/
- atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
- adjustment = 0;
- if (rwsem_optimistic_spin(sem, false)) {
- /* rwsem_optimistic_spin() implies ACQUIRE on success */
+ if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) {
+ rwsem_set_reader_owned(sem);
+ lockevent_inc(rwsem_rlock_steal);
+
/*
- * Wake up other readers in the wait list if the front
- * waiter is a reader.
+ * Wake up other readers in the wait queue if it is
+ * the first reader.
*/
- if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) {
+ if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) {
raw_spin_lock_irq(&sem->wait_lock);
if (!list_empty(&sem->wait_list))
rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
@@ -1027,9 +926,6 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
wake_up_q(&wake_q);
}
return sem;
- } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) {
- /* rwsem_reader_phase_trylock() implies ACQUIRE on success */
- return sem;
}
queue:
@@ -1045,7 +941,7 @@ queue:
* exit the slowpath and return immediately as its
* RWSEM_READER_BIAS has already been set in the count.
*/
- if (adjustment && !(atomic_long_read(&sem->count) &
+ if (!(atomic_long_read(&sem->count) &
(RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
/* Provide lock ACQUIRE */
smp_acquire__after_ctrl_dep();
@@ -1059,10 +955,7 @@ queue:
list_add_tail(&waiter.list, &sem->wait_list);
/* we're now waiting on the lock, but no longer actively locking */
- if (adjustment)
- count = atomic_long_add_return(adjustment, &sem->count);
- else
- count = atomic_long_read(&sem->count);
+ count = atomic_long_add_return(adjustment, &sem->count);
/*
* If there are no active locks, wake the front queued process(es).
@@ -1071,7 +964,7 @@ queue:
* wake our own waiter to join the existing active readers !
*/
if (!(count & RWSEM_LOCK_MASK)) {
- clear_wr_nonspinnable(sem);
+ clear_nonspinnable(sem);
wake = true;
}
if (wake || (!(count & RWSEM_WRITER_MASK) &&
@@ -1117,46 +1010,24 @@ out_nolock:
}
/*
- * This function is called by the a write lock owner. So the owner value
- * won't get changed by others.
- */
-static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem,
- bool disable)
-{
- if (unlikely(disable)) {
- atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner);
- lockevent_inc(rwsem_opt_norspin);
- }
-}
-
-/*
* Wait until we successfully acquire the write lock
*/
static struct rw_semaphore *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
{
long count;
- bool disable_rspin;
enum writer_wait_state wstate;
struct rwsem_waiter waiter;
struct rw_semaphore *ret = sem;
DEFINE_WAKE_Q(wake_q);
/* do optimistic spinning and steal lock if possible */
- if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) &&
- rwsem_optimistic_spin(sem, true)) {
+ if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) {
/* rwsem_optimistic_spin() implies ACQUIRE on success */
return sem;
}
/*
- * Disable reader optimistic spinning for this rwsem after
- * acquiring the write lock when the setting of the nonspinnable
- * bits are observed.
- */
- disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE;
-
- /*
* Optimistic spinning failed, proceed to the slowpath
* and block until we can acquire the sem.
*/
@@ -1224,7 +1095,7 @@ wait:
* without sleeping.
*/
if (wstate == WRITER_HANDOFF &&
- rwsem_spin_on_owner(sem, RWSEM_NONSPINNABLE) == OWNER_NULL)
+ rwsem_spin_on_owner(sem) == OWNER_NULL)
goto trylock_again;
/* Block until there are no active lockers. */
@@ -1266,7 +1137,6 @@ trylock_again:
}
__set_current_state(TASK_RUNNING);
list_del(&waiter.list);
- rwsem_disable_reader_optspin(sem, disable_rspin);
raw_spin_unlock_irq(&sem->wait_lock);
lockevent_inc(rwsem_wlock);
@@ -1335,26 +1205,31 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
/*
* lock for reading
*/
-static inline void __down_read(struct rw_semaphore *sem)
+static inline int __down_read_common(struct rw_semaphore *sem, int state)
{
- if (!rwsem_read_trylock(sem)) {
- rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
+ long count;
+
+ if (!rwsem_read_trylock(sem, &count)) {
+ if (IS_ERR(rwsem_down_read_slowpath(sem, count, state)))
+ return -EINTR;
DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
- } else {
- rwsem_set_reader_owned(sem);
}
+ return 0;
+}
+
+static inline void __down_read(struct rw_semaphore *sem)
+{
+ __down_read_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+static inline int __down_read_interruptible(struct rw_semaphore *sem)
+{
+ return __down_read_common(sem, TASK_INTERRUPTIBLE);
}
static inline int __down_read_killable(struct rw_semaphore *sem)
{
- if (!rwsem_read_trylock(sem)) {
- if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE)))
- return -EINTR;
- DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
- } else {
- rwsem_set_reader_owned(sem);
- }
- return 0;
+ return __down_read_common(sem, TASK_KILLABLE);
}
static inline int __down_read_trylock(struct rw_semaphore *sem)
@@ -1380,44 +1255,30 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
/*
* lock for writing
*/
-static inline void __down_write(struct rw_semaphore *sem)
+static inline int __down_write_common(struct rw_semaphore *sem, int state)
{
- long tmp = RWSEM_UNLOCKED_VALUE;
+ if (unlikely(!rwsem_write_trylock(sem))) {
+ if (IS_ERR(rwsem_down_write_slowpath(sem, state)))
+ return -EINTR;
+ }
- if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
- RWSEM_WRITER_LOCKED)))
- rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE);
- else
- rwsem_set_owner(sem);
+ return 0;
}
-static inline int __down_write_killable(struct rw_semaphore *sem)
+static inline void __down_write(struct rw_semaphore *sem)
{
- long tmp = RWSEM_UNLOCKED_VALUE;
+ __down_write_common(sem, TASK_UNINTERRUPTIBLE);
+}
- if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
- RWSEM_WRITER_LOCKED))) {
- if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE)))
- return -EINTR;
- } else {
- rwsem_set_owner(sem);
- }
- return 0;
+static inline int __down_write_killable(struct rw_semaphore *sem)
+{
+ return __down_write_common(sem, TASK_KILLABLE);
}
static inline int __down_write_trylock(struct rw_semaphore *sem)
{
- long tmp;
-
DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem);
-
- tmp = RWSEM_UNLOCKED_VALUE;
- if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
- RWSEM_WRITER_LOCKED)) {
- rwsem_set_owner(sem);
- return true;
- }
- return false;
+ return rwsem_write_trylock(sem);
}
/*
@@ -1435,7 +1296,7 @@ static inline void __up_read(struct rw_semaphore *sem)
DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
RWSEM_FLAG_WAITERS)) {
- clear_wr_nonspinnable(sem);
+ clear_nonspinnable(sem);
rwsem_wake(sem, tmp);
}
}
@@ -1495,6 +1356,20 @@ void __sched down_read(struct rw_semaphore *sem)
}
EXPORT_SYMBOL(down_read);
+int __sched down_read_interruptible(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) {
+ rwsem_release(&sem->dep_map, _RET_IP_);
+ return -EINTR;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(down_read_interruptible);
+
int __sched down_read_killable(struct rw_semaphore *sem)
{
might_sleep();
@@ -1605,6 +1480,20 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
}
EXPORT_SYMBOL(down_read_nested);
+int down_read_killable_nested(struct rw_semaphore *sem, int subclass)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
+ rwsem_release(&sem->dep_map, _RET_IP_);
+ return -EINTR;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(down_read_killable_nested);
+
void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
{
might_sleep();
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 12dd41b39a7f..abc01fcad8c7 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -153,7 +153,6 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
struct nsproxy *old_ns = tsk->nsproxy;
struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
struct nsproxy *new_ns;
- int ret;
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET |
@@ -173,18 +172,14 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
* it along with CLONE_NEWIPC.
*/
if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) ==
- (CLONE_NEWIPC | CLONE_SYSVSEM))
+ (CLONE_NEWIPC | CLONE_SYSVSEM))
return -EINVAL;
new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);
- ret = timens_on_fork(new_ns, tsk);
- if (ret) {
- free_nsproxy(new_ns);
- return ret;
- }
+ timens_on_fork(new_ns, tsk);
tsk->nsproxy = new_ns;
return 0;
@@ -250,8 +245,8 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
p->nsproxy = new;
task_unlock(p);
- if (ns && atomic_dec_and_test(&ns->count))
- free_nsproxy(ns);
+ if (ns)
+ put_nsproxy(ns);
}
void exit_task_namespaces(struct task_struct *p)
diff --git a/kernel/pid.c b/kernel/pid.c
index a96bc4bf4f86..47466d0bbc5b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -73,7 +73,7 @@ int pid_max_max = PID_MAX_LIMIT;
* the scheme scales to up to 4 million PIDs, runtime.
*/
struct pid_namespace init_pid_ns = {
- .kref = KREF_INIT(2),
+ .ns.count = REFCOUNT_INIT(2),
.idr = IDR_INIT(init_pid_ns.idr),
.pid_allocated = PIDNS_ADDING,
.level = 0,
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 9de21803a8ae..ca43239a255a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -102,7 +102,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
goto out_free_idr;
ns->ns.ops = &pidns_operations;
- kref_init(&ns->kref);
+ refcount_set(&ns->ns.count, 1);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
@@ -148,22 +148,15 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
return create_pid_namespace(user_ns, old_ns);
}
-static void free_pid_ns(struct kref *kref)
-{
- struct pid_namespace *ns;
-
- ns = container_of(kref, struct pid_namespace, kref);
- destroy_pid_namespace(ns);
-}
-
void put_pid_ns(struct pid_namespace *ns)
{
struct pid_namespace *parent;
while (ns != &init_pid_ns) {
parent = ns->parent;
- if (!kref_put(&ns->kref, free_pid_ns))
+ if (!refcount_dec_and_test(&ns->ns.count))
break;
+ destroy_pid_namespace(ns);
ns = parent;
}
}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9ef23d4b07c7..28713dda3f6b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -528,8 +528,8 @@ static int log_store(u32 caller_id, int facility, int level,
if (dev_info)
memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
- /* insert message */
- if ((flags & LOG_CONT) || !(flags & LOG_NEWLINE))
+ /* A message without a trailing newline can be continued. */
+ if (!(flags & LOG_NEWLINE))
prb_commit(&e);
else
prb_final_commit(&e);
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index 6b1525685277..74e25a1704f2 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -882,8 +882,6 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out)
head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */
do {
- desc = to_desc(desc_ring, head_id);
-
id = DESC_ID(head_id + 1);
id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 79de1294f8eb..add677d79fcf 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -117,9 +117,9 @@ void __ptrace_unlink(struct task_struct *child)
const struct cred *old_cred;
BUG_ON(!child->ptrace);
- clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-#ifdef TIF_SYSCALL_EMU
- clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+ clear_task_syscall_work(child, SYSCALL_TRACE);
+#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
+ clear_task_syscall_work(child, SYSCALL_EMU);
#endif
child->parent = child->real_parent;
@@ -806,15 +806,15 @@ static int ptrace_resume(struct task_struct *child, long request,
return -EIO;
if (request == PTRACE_SYSCALL)
- set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ set_task_syscall_work(child, SYSCALL_TRACE);
else
- clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+ clear_task_syscall_work(child, SYSCALL_TRACE);
-#ifdef TIF_SYSCALL_EMU
+#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP)
- set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+ set_task_syscall_work(child, SYSCALL_EMU);
else
- clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+ clear_task_syscall_work(child, SYSCALL_EMU);
#endif
if (is_singleblock(request)) {
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index b71e21f73c40..cdc57b4f6d48 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -221,19 +221,23 @@ config RCU_NOCB_CPU
Use this option to reduce OS jitter for aggressive HPC or
real-time workloads. It can also be used to offload RCU
callback invocation to energy-efficient CPUs in battery-powered
- asymmetric multiprocessors.
+ asymmetric multiprocessors. The price of this reduced jitter
+ is that the overhead of call_rcu() increases and that some
+ workloads will incur significant increases in context-switch
+ rates.
This option offloads callback invocation from the set of CPUs
specified at boot time by the rcu_nocbs parameter. For each
such CPU, a kthread ("rcuox/N") will be created to invoke
callbacks, where the "N" is the CPU being offloaded, and where
- the "p" for RCU-preempt (PREEMPTION kernels) and "s" for RCU-sched
- (!PREEMPTION kernels). Nothing prevents this kthread from running
- on the specified CPUs, but (1) the kthreads may be preempted
- between each callback, and (2) affinity or cgroups can be used
- to force the kthreads to run on whatever set of CPUs is desired.
-
- Say Y here if you want to help to debug reduced OS jitter.
+ the "x" is "p" for RCU-preempt (PREEMPTION kernels) and "s" for
+ RCU-sched (!PREEMPTION kernels). Nothing prevents this kthread
+ from running on the specified CPUs, but (1) the kthreads may be
+ preempted between each callback, and (2) affinity or cgroups can
+ be used to force the kthreads to run on whatever set of CPUs is
+ desired.
+
+ Say Y here if you need reduced OS jitter, despite added overhead.
Say N here if you are unsure.
config TASKS_TRACE_RCU_READ_MB
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index e01cba5e4b52..59ef1ae6dc37 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -533,4 +533,20 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; }
static inline void rcu_bind_current_to_nocb(void) { }
#endif
+#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_RCU)
+void show_rcu_tasks_classic_gp_kthread(void);
+#else
+static inline void show_rcu_tasks_classic_gp_kthread(void) {}
+#endif
+#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_RUDE_RCU)
+void show_rcu_tasks_rude_gp_kthread(void);
+#else
+static inline void show_rcu_tasks_rude_gp_kthread(void) {}
+#endif
+#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_TRACE_RCU)
+void show_rcu_tasks_trace_gp_kthread(void);
+#else
+static inline void show_rcu_tasks_trace_gp_kthread(void) {}
+#endif
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 5c293afc07b8..492262bcb591 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -62,7 +62,7 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp)
/* Is the specified rcu_segcblist offloaded? */
static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp)
{
- return rsclp->offloaded;
+ return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rsclp->offloaded;
}
/*
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 2819b95479af..06491d5530db 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -38,6 +38,7 @@
#include <asm/byteorder.h>
#include <linux/torture.h>
#include <linux/vmalloc.h>
+#include <linux/rcupdate_trace.h>
#include "rcu.h"
@@ -294,6 +295,35 @@ static struct rcu_scale_ops tasks_ops = {
.name = "tasks"
};
+/*
+ * Definitions for RCU-tasks-trace scalability testing.
+ */
+
+static int tasks_trace_scale_read_lock(void)
+{
+ rcu_read_lock_trace();
+ return 0;
+}
+
+static void tasks_trace_scale_read_unlock(int idx)
+{
+ rcu_read_unlock_trace();
+}
+
+static struct rcu_scale_ops tasks_tracing_ops = {
+ .ptype = RCU_TASKS_FLAVOR,
+ .init = rcu_sync_scale_init,
+ .readlock = tasks_trace_scale_read_lock,
+ .readunlock = tasks_trace_scale_read_unlock,
+ .get_gp_seq = rcu_no_completed,
+ .gp_diff = rcu_seq_diff,
+ .async = call_rcu_tasks_trace,
+ .gp_barrier = rcu_barrier_tasks_trace,
+ .sync = synchronize_rcu_tasks_trace,
+ .exp_sync = synchronize_rcu_tasks_trace,
+ .name = "tasks-tracing"
+};
+
static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old)
{
if (!cur_ops->gp_diff)
@@ -754,7 +784,7 @@ rcu_scale_init(void)
long i;
int firsterr = 0;
static struct rcu_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops,
+ &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, &tasks_tracing_ops
};
if (!torture_init_begin(scale_type, verbose))
@@ -772,7 +802,6 @@ rcu_scale_init(void)
for (i = 0; i < ARRAY_SIZE(scale_ops); i++)
pr_cont(" %s", scale_ops[i]->name);
pr_cont("\n");
- WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST));
firsterr = -EINVAL;
cur_ops = NULL;
goto unwind;
@@ -846,6 +875,10 @@ rcu_scale_init(void)
unwind:
torture_init_end();
rcu_scale_cleanup();
+ if (shutdown) {
+ WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST));
+ kernel_power_off();
+ }
return firsterr;
}
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 916ea4f66e4b..528ed10b78fd 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -317,6 +317,7 @@ struct rcu_torture_ops {
void (*cb_barrier)(void);
void (*fqs)(void);
void (*stats)(void);
+ void (*gp_kthread_dbg)(void);
int (*stall_dur)(void);
int irq_capable;
int can_boost;
@@ -466,6 +467,7 @@ static struct rcu_torture_ops rcu_ops = {
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
.stats = NULL,
+ .gp_kthread_dbg = show_rcu_gp_kthreads,
.stall_dur = rcu_jiffies_till_stall_check,
.irq_capable = 1,
.can_boost = rcu_can_boost(),
@@ -693,6 +695,7 @@ static struct rcu_torture_ops tasks_ops = {
.exp_sync = synchronize_rcu_mult_test,
.call = call_rcu_tasks,
.cb_barrier = rcu_barrier_tasks,
+ .gp_kthread_dbg = show_rcu_tasks_classic_gp_kthread,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
@@ -762,6 +765,7 @@ static struct rcu_torture_ops tasks_rude_ops = {
.exp_sync = synchronize_rcu_tasks_rude,
.call = call_rcu_tasks_rude,
.cb_barrier = rcu_barrier_tasks_rude,
+ .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
@@ -800,6 +804,7 @@ static struct rcu_torture_ops tasks_tracing_ops = {
.exp_sync = synchronize_rcu_tasks_trace,
.call = call_rcu_tasks_trace,
.cb_barrier = rcu_barrier_tasks_trace,
+ .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread,
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
@@ -912,7 +917,8 @@ static int rcu_torture_boost(void *arg)
oldstarttime = boost_starttime;
while (time_before(jiffies, oldstarttime)) {
schedule_timeout_interruptible(oldstarttime - jiffies);
- stutter_wait("rcu_torture_boost");
+ if (stutter_wait("rcu_torture_boost"))
+ sched_set_fifo_low(current);
if (torture_must_stop())
goto checkwait;
}
@@ -932,7 +938,8 @@ static int rcu_torture_boost(void *arg)
jiffies);
call_rcu_time = jiffies;
}
- stutter_wait("rcu_torture_boost");
+ if (stutter_wait("rcu_torture_boost"))
+ sched_set_fifo_low(current);
if (torture_must_stop())
goto checkwait;
}
@@ -964,7 +971,8 @@ static int rcu_torture_boost(void *arg)
}
/* Go do the stutter. */
-checkwait: stutter_wait("rcu_torture_boost");
+checkwait: if (stutter_wait("rcu_torture_boost"))
+ sched_set_fifo_low(current);
} while (!torture_must_stop());
/* Clean up and exit. */
@@ -987,6 +995,7 @@ rcu_torture_fqs(void *arg)
{
unsigned long fqs_resume_time;
int fqs_burst_remaining;
+ int oldnice = task_nice(current);
VERBOSE_TOROUT_STRING("rcu_torture_fqs task started");
do {
@@ -1002,7 +1011,8 @@ rcu_torture_fqs(void *arg)
udelay(fqs_holdoff);
fqs_burst_remaining -= fqs_holdoff;
}
- stutter_wait("rcu_torture_fqs");
+ if (stutter_wait("rcu_torture_fqs"))
+ sched_set_normal(current, oldnice);
} while (!torture_must_stop());
torture_kthread_stopping("rcu_torture_fqs");
return 0;
@@ -1022,9 +1032,11 @@ rcu_torture_writer(void *arg)
bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
bool gp_sync1 = gp_sync;
int i;
+ int oldnice = task_nice(current);
struct rcu_torture *rp;
struct rcu_torture *old_rp;
static DEFINE_TORTURE_RANDOM(rand);
+ bool stutter_waited;
int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC,
RTWS_COND_GET, RTWS_SYNC };
int nsynctypes = 0;
@@ -1143,7 +1155,8 @@ rcu_torture_writer(void *arg)
!rcu_gp_is_normal();
}
rcu_torture_writer_state = RTWS_STUTTER;
- if (stutter_wait("rcu_torture_writer") &&
+ stutter_waited = stutter_wait("rcu_torture_writer");
+ if (stutter_waited &&
!READ_ONCE(rcu_fwd_cb_nodelay) &&
!cur_ops->slow_gps &&
!torture_must_stop() &&
@@ -1155,6 +1168,8 @@ rcu_torture_writer(void *arg)
rcu_ftrace_dump(DUMP_ALL);
WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
}
+ if (stutter_waited)
+ sched_set_normal(current, oldnice);
} while (!torture_must_stop());
rcu_torture_current = NULL; // Let stats task know that we are done.
/* Reset expediting back to unexpedited. */
@@ -1594,7 +1609,8 @@ rcu_torture_stats_print(void)
sched_show_task(wtp);
splatted = true;
}
- show_rcu_gp_kthreads();
+ if (cur_ops->gp_kthread_dbg)
+ cur_ops->gp_kthread_dbg();
rcu_ftrace_dump(DUMP_ALL);
}
rtcv_snap = rcu_torture_current_version;
@@ -1913,7 +1929,9 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
unsigned long stopat;
static DEFINE_TORTURE_RANDOM(trs);
- if (cur_ops->call && cur_ops->sync && cur_ops->cb_barrier) {
+ if (!cur_ops->sync)
+ return; // Cannot do need_resched() forward progress testing without ->sync.
+ if (cur_ops->call && cur_ops->cb_barrier) {
init_rcu_head_on_stack(&fcs.rh);
selfpropcb = true;
}
@@ -2103,6 +2121,7 @@ static struct notifier_block rcutorture_oom_nb = {
/* Carry out grace-period forward-progress testing. */
static int rcu_torture_fwd_prog(void *args)
{
+ int oldnice = task_nice(current);
struct rcu_fwd *rfp = args;
int tested = 0;
int tested_tries = 0;
@@ -2121,7 +2140,8 @@ static int rcu_torture_fwd_prog(void *args)
rcu_torture_fwd_prog_cr(rfp);
/* Avoid slow periods, better to test when busy. */
- stutter_wait("rcu_torture_fwd_prog");
+ if (stutter_wait("rcu_torture_fwd_prog"))
+ sched_set_normal(current, oldnice);
} while (!torture_must_stop());
/* Short runs might not contain a valid forward-progress attempt. */
WARN_ON(!tested && tested_tries >= 5);
@@ -2137,8 +2157,8 @@ static int __init rcu_torture_fwd_prog_init(void)
if (!fwd_progress)
return 0; /* Not requested, so don't do it. */
- if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 ||
- cur_ops == &rcu_busted_ops) {
+ if ((!cur_ops->sync && !cur_ops->call) ||
+ !cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || cur_ops == &rcu_busted_ops) {
VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test");
return 0;
}
@@ -2472,7 +2492,8 @@ rcu_torture_cleanup(void)
return;
}
- show_rcu_gp_kthreads();
+ if (cur_ops->gp_kthread_dbg)
+ cur_ops->gp_kthread_dbg();
rcu_torture_read_exit_cleanup();
rcu_torture_barrier_cleanup();
rcu_torture_fwd_prog_cleanup();
@@ -2484,13 +2505,13 @@ rcu_torture_cleanup(void)
torture_stop_kthread(rcu_torture_reader,
reader_tasks[i]);
kfree(reader_tasks);
+ reader_tasks = NULL;
}
if (fakewriter_tasks) {
- for (i = 0; i < nfakewriters; i++) {
+ for (i = 0; i < nfakewriters; i++)
torture_stop_kthread(rcu_torture_fakewriter,
fakewriter_tasks[i]);
- }
kfree(fakewriter_tasks);
fakewriter_tasks = NULL;
}
@@ -2647,7 +2668,6 @@ rcu_torture_init(void)
for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
pr_cont(" %s", torture_ops[i]->name);
pr_cont("\n");
- WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST));
firsterr = -EINVAL;
cur_ops = NULL;
goto unwind;
@@ -2815,6 +2835,10 @@ rcu_torture_init(void)
unwind:
torture_init_end();
rcu_torture_cleanup();
+ if (shutdown_secs) {
+ WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST));
+ kernel_power_off();
+ }
return firsterr;
}
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 952595c678b3..23ff36a66f97 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -658,7 +658,6 @@ ref_scale_init(void)
for (i = 0; i < ARRAY_SIZE(scale_ops); i++)
pr_cont(" %s", scale_ops[i]->name);
pr_cont("\n");
- WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST));
firsterr = -EINVAL;
cur_ops = NULL;
goto unwind;
@@ -681,6 +680,12 @@ ref_scale_init(void)
// Reader tasks (default to ~75% of online CPUs).
if (nreaders < 0)
nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
+ if (WARN_ONCE(loops <= 0, "%s: loops = %ld, adjusted to 1\n", __func__, loops))
+ loops = 1;
+ if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders))
+ nreaders = 1;
+ if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns))
+ nruns = 1;
reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
GFP_KERNEL);
if (!reader_tasks) {
@@ -712,6 +717,10 @@ ref_scale_init(void)
unwind:
torture_init_end();
ref_scale_cleanup();
+ if (shutdown) {
+ WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST));
+ kernel_power_off();
+ }
return firsterr;
}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index c13348ee80a5..0f23d20d485a 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -177,11 +177,13 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
INIT_DELAYED_WORK(&ssp->work, process_srcu);
if (!is_static)
ssp->sda = alloc_percpu(struct srcu_data);
+ if (!ssp->sda)
+ return -ENOMEM;
init_srcu_struct_nodes(ssp, is_static);
ssp->srcu_gp_seq_needed_exp = 0;
ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
- return ssp->sda ? 0 : -ENOMEM;
+ return 0;
}
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -906,7 +908,7 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
{
struct rcu_synchronize rcu;
- RCU_LOCKDEP_WARN(lock_is_held(&ssp->dep_map) ||
+ RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
lock_is_held(&rcu_sched_lock_map),
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index d5d9f2d03e8a..35bdcfd84d42 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -290,7 +290,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
".C"[!!data_race(rtp->cbs_head)],
s);
}
-#endif /* #ifndef CONFIG_TINY_RCU */
+#endif // #ifndef CONFIG_TINY_RCU
static void exit_tasks_rcu_finish_trace(struct task_struct *t);
@@ -335,23 +335,18 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// Start off with initial wait and slowly back off to 1 HZ wait.
fract = rtp->init_fract;
- if (fract > HZ)
- fract = HZ;
- for (;;) {
+ while (!list_empty(&holdouts)) {
bool firstreport;
bool needreport;
int rtst;
- if (list_empty(&holdouts))
- break;
-
/* Slowly back off waiting for holdouts */
set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS);
- schedule_timeout_idle(HZ/fract);
+ schedule_timeout_idle(fract);
- if (fract > 1)
- fract--;
+ if (fract < HZ)
+ fract++;
rtst = READ_ONCE(rcu_task_stall_timeout);
needreport = rtst > 0 && time_after(jiffies, lastreport + rtst);
@@ -560,7 +555,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
static int __init rcu_spawn_tasks_kthread(void)
{
rcu_tasks.gp_sleep = HZ / 10;
- rcu_tasks.init_fract = 10;
+ rcu_tasks.init_fract = HZ / 10;
rcu_tasks.pregp_func = rcu_tasks_pregp_step;
rcu_tasks.pertask_func = rcu_tasks_pertask;
rcu_tasks.postscan_func = rcu_tasks_postscan;
@@ -571,12 +566,13 @@ static int __init rcu_spawn_tasks_kthread(void)
}
core_initcall(rcu_spawn_tasks_kthread);
-#ifndef CONFIG_TINY_RCU
-static void show_rcu_tasks_classic_gp_kthread(void)
+#if !defined(CONFIG_TINY_RCU)
+void show_rcu_tasks_classic_gp_kthread(void)
{
show_rcu_tasks_generic_gp_kthread(&rcu_tasks, "");
}
-#endif /* #ifndef CONFIG_TINY_RCU */
+EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread);
+#endif // !defined(CONFIG_TINY_RCU)
/* Do the srcu_read_lock() for the above synchronize_srcu(). */
void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
@@ -598,7 +594,6 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu)
}
#else /* #ifdef CONFIG_TASKS_RCU */
-static inline void show_rcu_tasks_classic_gp_kthread(void) { }
void exit_tasks_rcu_start(void) { }
void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
#endif /* #else #ifdef CONFIG_TASKS_RCU */
@@ -699,16 +694,14 @@ static int __init rcu_spawn_tasks_rude_kthread(void)
}
core_initcall(rcu_spawn_tasks_rude_kthread);
-#ifndef CONFIG_TINY_RCU
-static void show_rcu_tasks_rude_gp_kthread(void)
+#if !defined(CONFIG_TINY_RCU)
+void show_rcu_tasks_rude_gp_kthread(void)
{
show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, "");
}
-#endif /* #ifndef CONFIG_TINY_RCU */
-
-#else /* #ifdef CONFIG_TASKS_RUDE_RCU */
-static void show_rcu_tasks_rude_gp_kthread(void) {}
-#endif /* #else #ifdef CONFIG_TASKS_RUDE_RCU */
+EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
+#endif // !defined(CONFIG_TINY_RCU)
+#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */
////////////////////////////////////////////////////////////////////////
//
@@ -1183,12 +1176,12 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
{
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) {
rcu_tasks_trace.gp_sleep = HZ / 10;
- rcu_tasks_trace.init_fract = 10;
+ rcu_tasks_trace.init_fract = HZ / 10;
} else {
rcu_tasks_trace.gp_sleep = HZ / 200;
if (rcu_tasks_trace.gp_sleep <= 0)
rcu_tasks_trace.gp_sleep = 1;
- rcu_tasks_trace.init_fract = HZ / 5;
+ rcu_tasks_trace.init_fract = HZ / 200;
if (rcu_tasks_trace.init_fract <= 0)
rcu_tasks_trace.init_fract = 1;
}
@@ -1202,8 +1195,8 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
}
core_initcall(rcu_spawn_tasks_trace_kthread);
-#ifndef CONFIG_TINY_RCU
-static void show_rcu_tasks_trace_gp_kthread(void)
+#if !defined(CONFIG_TINY_RCU)
+void show_rcu_tasks_trace_gp_kthread(void)
{
char buf[64];
@@ -1213,11 +1206,11 @@ static void show_rcu_tasks_trace_gp_kthread(void)
data_race(n_heavy_reader_attempts));
show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf);
}
-#endif /* #ifndef CONFIG_TINY_RCU */
+EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread);
+#endif // !defined(CONFIG_TINY_RCU)
#else /* #ifdef CONFIG_TASKS_TRACE_RCU */
static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
-static inline void show_rcu_tasks_trace_gp_kthread(void) {}
#endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
#ifndef CONFIG_TINY_RCU
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ed4941f0bd59..40e5e3dd253e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444);
* per-CPU. Object size is equal to one page. This value
* can be changed at boot time.
*/
-static int rcu_min_cached_objs = 2;
+static int rcu_min_cached_objs = 5;
module_param(rcu_min_cached_objs, int, 0444);
/* Retrieve RCU kthreads priority for rcutorture */
@@ -341,6 +341,14 @@ static bool rcu_dynticks_in_eqs(int snap)
return !(snap & RCU_DYNTICK_CTRL_CTR);
}
+/* Return true if the specified CPU is currently idle from an RCU viewpoint. */
+bool rcu_is_idle_cpu(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ return rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp));
+}
+
/*
* Return true if the CPU corresponding to the specified rcu_data
* structure has spent some time in an extended quiescent state since
@@ -546,12 +554,12 @@ static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param
return ret;
}
-static struct kernel_param_ops first_fqs_jiffies_ops = {
+static const struct kernel_param_ops first_fqs_jiffies_ops = {
.set = param_set_first_fqs_jiffies,
.get = param_get_ulong,
};
-static struct kernel_param_ops next_fqs_jiffies_ops = {
+static const struct kernel_param_ops next_fqs_jiffies_ops = {
.set = param_set_next_fqs_jiffies,
.get = param_get_ulong,
};
@@ -928,8 +936,8 @@ void __rcu_irq_enter_check_tick(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- // Enabling the tick is unsafe in NMI handlers.
- if (WARN_ON_ONCE(in_nmi()))
+ // If we're here from NMI there's nothing to do.
+ if (in_nmi())
return;
RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(),
@@ -1093,8 +1101,11 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
* CPU can safely enter RCU read-side critical sections. In other words,
* if the current CPU is not in its idle loop or is in an interrupt or
* NMI handler, return true.
+ *
+ * Make notrace because it can be called by the internal functions of
+ * ftrace, and making this notrace removes unnecessary recursion calls.
*/
-bool rcu_is_watching(void)
+notrace bool rcu_is_watching(void)
{
bool ret;
@@ -1149,7 +1160,7 @@ bool rcu_lockdep_current_cpu_online(void)
preempt_disable_notrace();
rdp = this_cpu_ptr(&rcu_data);
rnp = rdp->mynode;
- if (rdp->grpmask & rcu_rnp_online_cpus(rnp))
+ if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1)
ret = true;
preempt_enable_notrace();
return ret;
@@ -1601,8 +1612,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
{
bool ret = false;
bool need_qs;
- const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_is_offloaded(&rdp->cblist);
+ const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
raw_lockdep_assert_held_rcu_node(rnp);
@@ -1713,6 +1723,7 @@ static void rcu_strict_gp_boundary(void *unused)
*/
static bool rcu_gp_init(void)
{
+ unsigned long firstseq;
unsigned long flags;
unsigned long oldmask;
unsigned long mask;
@@ -1756,6 +1767,12 @@ static bool rcu_gp_init(void)
*/
rcu_state.gp_state = RCU_GP_ONOFF;
rcu_for_each_leaf_node(rnp) {
+ smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
+ firstseq = READ_ONCE(rnp->ofl_seq);
+ if (firstseq & 0x1)
+ while (firstseq == READ_ONCE(rnp->ofl_seq))
+ schedule_timeout_idle(1); // Can't wake unless RCU is watching.
+ smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values.
raw_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irq_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -2046,8 +2063,7 @@ static void rcu_gp_cleanup(void)
needgp = true;
}
/* Advance CBs to reduce false positives below. */
- offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_is_offloaded(&rdp->cblist);
+ offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
@@ -2246,8 +2262,7 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
unsigned long flags;
unsigned long mask;
bool needwake = false;
- const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_is_offloaded(&rdp->cblist);
+ const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
struct rcu_node *rnp;
WARN_ON_ONCE(rdp->cpu != smp_processor_id());
@@ -2397,6 +2412,7 @@ int rcutree_dead_cpu(unsigned int cpu)
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
return 0;
+ WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
/* Do any needed no-CB deferred wakeups from this CPU. */
@@ -2415,8 +2431,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
{
int div;
unsigned long flags;
- const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_is_offloaded(&rdp->cblist);
+ const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
struct rcu_head *rhp;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
long bl, count;
@@ -2673,8 +2688,7 @@ static __latent_entropy void rcu_core(void)
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
struct rcu_node *rnp = rdp->mynode;
- const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- rcu_segcblist_is_offloaded(&rdp->cblist);
+ const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist);
if (cpu_is_offline(smp_processor_id()))
return;
@@ -2976,8 +2990,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
rcu_segcblist_n_cbs(&rdp->cblist));
/* Go handle any RCU core processing required. */
- if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
- unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
+ if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) {
__call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */
} else {
__call_rcu_core(rdp, head, flags);
@@ -3082,6 +3095,9 @@ struct kfree_rcu_cpu_work {
* In order to save some per-cpu space the list is singular.
* Even though it is lockless an access has to be protected by the
* per-cpu lock.
+ * @page_cache_work: A work to refill the cache when it is empty
+ * @work_in_progress: Indicates that page_cache_work is running
+ * @hrtimer: A hrtimer for scheduling a page_cache_work
* @nr_bkv_objs: number of allocated objects at @bkvcache.
*
* This is a per-CPU structure. The reason that it is not included in
@@ -3098,6 +3114,11 @@ struct kfree_rcu_cpu {
bool monitor_todo;
bool initialized;
int count;
+
+ struct work_struct page_cache_work;
+ atomic_t work_in_progress;
+ struct hrtimer hrtimer;
+
struct llist_head bkvcache;
int nr_bkv_objs;
};
@@ -3215,10 +3236,10 @@ static void kfree_rcu_work(struct work_struct *work)
}
rcu_lock_release(&rcu_callback_map);
- krcp = krc_this_cpu_lock(&flags);
+ raw_spin_lock_irqsave(&krcp->lock, flags);
if (put_cached_bnode(krcp, bkvhead[i]))
bkvhead[i] = NULL;
- krc_this_cpu_unlock(krcp, flags);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
if (bkvhead[i])
free_page((unsigned long) bkvhead[i]);
@@ -3345,6 +3366,57 @@ static void kfree_rcu_monitor(struct work_struct *work)
raw_spin_unlock_irqrestore(&krcp->lock, flags);
}
+static enum hrtimer_restart
+schedule_page_work_fn(struct hrtimer *t)
+{
+ struct kfree_rcu_cpu *krcp =
+ container_of(t, struct kfree_rcu_cpu, hrtimer);
+
+ queue_work(system_highpri_wq, &krcp->page_cache_work);
+ return HRTIMER_NORESTART;
+}
+
+static void fill_page_cache_func(struct work_struct *work)
+{
+ struct kvfree_rcu_bulk_data *bnode;
+ struct kfree_rcu_cpu *krcp =
+ container_of(work, struct kfree_rcu_cpu,
+ page_cache_work);
+ unsigned long flags;
+ bool pushed;
+ int i;
+
+ for (i = 0; i < rcu_min_cached_objs; i++) {
+ bnode = (struct kvfree_rcu_bulk_data *)
+ __get_free_page(GFP_KERNEL | __GFP_NOWARN);
+
+ if (bnode) {
+ raw_spin_lock_irqsave(&krcp->lock, flags);
+ pushed = put_cached_bnode(krcp, bnode);
+ raw_spin_unlock_irqrestore(&krcp->lock, flags);
+
+ if (!pushed) {
+ free_page((unsigned long) bnode);
+ break;
+ }
+ }
+ }
+
+ atomic_set(&krcp->work_in_progress, 0);
+}
+
+static void
+run_page_cache_worker(struct kfree_rcu_cpu *krcp)
+{
+ if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
+ !atomic_xchg(&krcp->work_in_progress, 1)) {
+ hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ krcp->hrtimer.function = schedule_page_work_fn;
+ hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
+ }
+}
+
static inline bool
kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
{
@@ -3361,32 +3433,8 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
if (!krcp->bkvhead[idx] ||
krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
bnode = get_cached_bnode(krcp);
- if (!bnode) {
- /*
- * To keep this path working on raw non-preemptible
- * sections, prevent the optional entry into the
- * allocator as it uses sleeping locks. In fact, even
- * if the caller of kfree_rcu() is preemptible, this
- * path still is not, as krcp->lock is a raw spinlock.
- * With additional page pre-allocation in the works,
- * hitting this return is going to be much less likely.
- */
- if (IS_ENABLED(CONFIG_PREEMPT_RT))
- return false;
-
- /*
- * NOTE: For one argument of kvfree_rcu() we can
- * drop the lock and get the page in sleepable
- * context. That would allow to maintain an array
- * for the CONFIG_PREEMPT_RT as well if no cached
- * pages are available.
- */
- bnode = (struct kvfree_rcu_bulk_data *)
- __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
- }
-
/* Switch to emergency path. */
- if (unlikely(!bnode))
+ if (!bnode)
return false;
/* Initialize the new block. */
@@ -3450,12 +3498,10 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
goto unlock_return;
}
- /*
- * Under high memory pressure GFP_NOWAIT can fail,
- * in that case the emergency path is maintained.
- */
success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
if (!success) {
+ run_page_cache_worker(krcp);
+
if (head == NULL)
// Inline if kvfree_rcu(one_arg) call.
goto unlock_return;
@@ -3565,7 +3611,7 @@ void __init kfree_rcu_scheduler_running(void)
* During early boot, any blocking grace-period wait automatically
* implies a grace period. Later on, this is never the case for PREEMPTION.
*
- * Howevr, because a context switch is a grace period for !PREEMPTION, any
+ * However, because a context switch is a grace period for !PREEMPTION, any
* blocking grace-period wait automatically implies a grace period if
* there is only one CPU online at any point time during execution of
* either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to
@@ -3581,7 +3627,20 @@ static int rcu_blocking_is_gp(void)
return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
might_sleep(); /* Check for RCU read-side critical section. */
preempt_disable();
- ret = num_online_cpus() <= 1;
+ /*
+ * If the rcu_state.n_online_cpus counter is equal to one,
+ * there is only one CPU, and that CPU sees all prior accesses
+ * made by any CPU that was online at the time of its access.
+ * Furthermore, if this counter is equal to one, its value cannot
+ * change until after the preempt_enable() below.
+ *
+ * Furthermore, if rcu_state.n_online_cpus is equal to one here,
+ * all later CPUs (both this one and any that come online later
+ * on) are guaranteed to see all accesses prior to this point
+ * in the code, without the need for additional memory barriers.
+ * Those memory barriers are provided by CPU-hotplug code.
+ */
+ ret = READ_ONCE(rcu_state.n_online_cpus) <= 1;
preempt_enable();
return ret;
}
@@ -3626,7 +3685,7 @@ void synchronize_rcu(void)
lock_is_held(&rcu_sched_lock_map),
"Illegal synchronize_rcu() in RCU read-side critical section");
if (rcu_blocking_is_gp())
- return;
+ return; // Context allows vacuous grace periods.
if (rcu_gp_is_expedited())
synchronize_rcu_expedited();
else
@@ -3705,13 +3764,13 @@ static int rcu_pending(int user)
return 1;
/* Does this CPU have callbacks ready to invoke? */
- if (rcu_segcblist_ready_cbs(&rdp->cblist))
+ if (!rcu_segcblist_is_offloaded(&rdp->cblist) &&
+ rcu_segcblist_ready_cbs(&rdp->cblist))
return 1;
/* Has RCU gone idle with this CPU needing another grace period? */
if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) &&
- (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) ||
- !rcu_segcblist_is_offloaded(&rdp->cblist)) &&
+ !rcu_segcblist_is_offloaded(&rdp->cblist) &&
!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
return 1;
@@ -3968,6 +4027,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rcu_prepare_kthreads(cpu);
rcu_spawn_cpu_nocb_kthread(cpu);
+ WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
return 0;
}
@@ -4056,6 +4116,9 @@ void rcu_cpu_starting(unsigned int cpu)
rnp = rdp->mynode;
mask = rdp->grpmask;
+ WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+ WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+ smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
newcpu = !(rnp->expmaskinitnext & mask);
@@ -4066,13 +4129,18 @@ void rcu_cpu_starting(unsigned int cpu)
rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
- if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */
+
+ /* An incoming CPU should never be blocking a grace period. */
+ if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
rcu_disable_urgency_upon_qs(rdp);
/* Report QS -after- changing ->qsmaskinitnext! */
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
} else {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+ smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
+ WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+ WARN_ON_ONCE(rnp->ofl_seq & 0x1);
smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
@@ -4099,6 +4167,9 @@ void rcu_report_dead(unsigned int cpu)
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
+ WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+ WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+ smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
raw_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
@@ -4111,6 +4182,9 @@ void rcu_report_dead(unsigned int cpu)
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
raw_spin_unlock(&rcu_state.ofl_lock);
+ smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
+ WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
+ WARN_ON_ONCE(rnp->ofl_seq & 0x1);
rdp->cpu_started = false;
}
@@ -4448,24 +4522,14 @@ static void __init kfree_rcu_batch_init(void)
for_each_possible_cpu(cpu) {
struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
- struct kvfree_rcu_bulk_data *bnode;
for (i = 0; i < KFREE_N_BATCHES; i++) {
INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
krcp->krw_arr[i].krcp = krcp;
}
- for (i = 0; i < rcu_min_cached_objs; i++) {
- bnode = (struct kvfree_rcu_bulk_data *)
- __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
-
- if (bnode)
- put_cached_bnode(krcp, bnode);
- else
- pr_err("Failed to preallocate for %d CPU!\n", cpu);
- }
-
INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
+ INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
krcp->initialized = true;
}
if (register_shrinker(&kfree_rcu_shrinker))
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e4f66b8f7c47..7708ed161f4a 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -56,6 +56,7 @@ struct rcu_node {
/* Initialized from ->qsmaskinitnext at the */
/* beginning of each grace period. */
unsigned long qsmaskinitnext;
+ unsigned long ofl_seq; /* CPU-hotplug operation sequence count. */
/* Online CPUs for next grace period. */
unsigned long expmask; /* CPUs or groups that need to check in */
/* to allow the current expedited GP */
@@ -298,6 +299,7 @@ struct rcu_state {
/* Hierarchy levels (+1 to */
/* shut bogus gcc warning) */
int ncpus; /* # CPUs seen so far. */
+ int n_online_cpus; /* # CPUs online for RCU. */
/* The following fields are guarded by the root rcu_node's lock. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fd8a52e9a887..7e291ce0a1d6 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -628,7 +628,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
set_tsk_need_resched(current);
set_preempt_need_resched();
if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
- !rdp->defer_qs_iw_pending && exp) {
+ !rdp->defer_qs_iw_pending && exp && cpu_online(rdp->cpu)) {
// Get scheduler to re-evaluate and call hooks.
// If !IRQ_WORK, FQS scan will eventually IPI.
init_irq_work(&rdp->defer_qs_iw,
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index ca21d28a0f98..70d48c52fabc 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -13,6 +13,7 @@
/* panic() on RCU Stall sysctl. */
int sysctl_panic_on_rcu_stall __read_mostly;
+int sysctl_max_rcu_stall_to_panic __read_mostly;
#ifdef CONFIG_PROVE_RCU
#define RCU_STALL_DELAY_DELTA (5 * HZ)
@@ -106,6 +107,11 @@ early_initcall(check_cpu_stall_init);
/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */
static void panic_on_rcu_stall(void)
{
+ static int cpu_stall;
+
+ if (++cpu_stall < sysctl_max_rcu_stall_to_panic)
+ return;
+
if (sysctl_panic_on_rcu_stall)
panic("RCU Stall\n");
}
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 554a521ee235..d55a9f8cda3d 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -59,9 +59,10 @@ torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable");
torture_param(int, shutdown_secs, 0, "Shutdown time (ms), <= zero to disable.");
torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s.");
-torture_param(int, stutter_cpus, 5, "Number of jiffies to change CPUs under test, 0=disable");
+torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU hotplug.");
torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
+torture_param(int, weight_resched, -1, "Testing weight for resched_cpu() operations.");
torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations.");
torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations.");
torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations.");
@@ -82,6 +83,7 @@ torture_param(bool, shutdown, SCFTORT_SHUTDOWN, "Shutdown at end of torture test
struct scf_statistics {
struct task_struct *task;
int cpu;
+ long long n_resched;
long long n_single;
long long n_single_ofl;
long long n_single_wait;
@@ -97,12 +99,15 @@ static struct task_struct *scf_torture_stats_task;
static DEFINE_PER_CPU(long long, scf_invoked_count);
// Data for random primitive selection
-#define SCF_PRIM_SINGLE 0
-#define SCF_PRIM_MANY 1
-#define SCF_PRIM_ALL 2
-#define SCF_NPRIMS (2 * 3) // Need wait and no-wait versions of each.
+#define SCF_PRIM_RESCHED 0
+#define SCF_PRIM_SINGLE 1
+#define SCF_PRIM_MANY 2
+#define SCF_PRIM_ALL 3
+#define SCF_NPRIMS 7 // Need wait and no-wait versions of each,
+ // except for SCF_PRIM_RESCHED.
static char *scf_prim_name[] = {
+ "resched_cpu",
"smp_call_function_single",
"smp_call_function_many",
"smp_call_function",
@@ -136,6 +141,8 @@ static char *bangstr = "";
static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand);
+extern void resched_cpu(int cpu); // An alternative IPI vector.
+
// Print torture statistics. Caller must ensure serialization.
static void scf_torture_stats_print(void)
{
@@ -148,6 +155,7 @@ static void scf_torture_stats_print(void)
for_each_possible_cpu(cpu)
invoked_count += data_race(per_cpu(scf_invoked_count, cpu));
for (i = 0; i < nthreads; i++) {
+ scfs.n_resched += scf_stats_p[i].n_resched;
scfs.n_single += scf_stats_p[i].n_single;
scfs.n_single_ofl += scf_stats_p[i].n_single_ofl;
scfs.n_single_wait += scf_stats_p[i].n_single_wait;
@@ -160,8 +168,8 @@ static void scf_torture_stats_print(void)
if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) ||
atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs))
bangstr = "!!! ";
- pr_alert("%s %sscf_invoked_count %s: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ",
- SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count,
+ pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ",
+ SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, scfs.n_resched,
scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl,
scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait);
torture_onoff_stats();
@@ -314,6 +322,13 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
}
}
switch (scfsp->scfs_prim) {
+ case SCF_PRIM_RESCHED:
+ if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST)) {
+ cpu = torture_random(trsp) % nr_cpu_ids;
+ scfp->n_resched++;
+ resched_cpu(cpu);
+ }
+ break;
case SCF_PRIM_SINGLE:
cpu = torture_random(trsp) % nr_cpu_ids;
if (scfsp->scfs_wait)
@@ -421,6 +436,7 @@ static int scftorture_invoker(void *arg)
was_offline = false;
}
cond_resched();
+ stutter_wait("scftorture_invoker");
} while (!torture_must_stop());
VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu);
@@ -433,8 +449,8 @@ static void
scftorture_print_module_parms(const char *tag)
{
pr_alert(SCFTORT_FLAG
- "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag,
- verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait);
+ "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag,
+ verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait);
}
static void scf_cleanup_handler(void *unused)
@@ -475,6 +491,7 @@ static int __init scf_torture_init(void)
{
long i;
int firsterr = 0;
+ unsigned long weight_resched1 = weight_resched;
unsigned long weight_single1 = weight_single;
unsigned long weight_single_wait1 = weight_single_wait;
unsigned long weight_many1 = weight_many;
@@ -487,9 +504,10 @@ static int __init scf_torture_init(void)
scftorture_print_module_parms("Start of test");
- if (weight_single == -1 && weight_single_wait == -1 &&
+ if (weight_resched == -1 && weight_single == -1 && weight_single_wait == -1 &&
weight_many == -1 && weight_many_wait == -1 &&
weight_all == -1 && weight_all_wait == -1) {
+ weight_resched1 = 2 * nr_cpu_ids;
weight_single1 = 2 * nr_cpu_ids;
weight_single_wait1 = 2 * nr_cpu_ids;
weight_many1 = 2;
@@ -497,6 +515,8 @@ static int __init scf_torture_init(void)
weight_all1 = 1;
weight_all_wait1 = 1;
} else {
+ if (weight_resched == -1)
+ weight_resched1 = 0;
if (weight_single == -1)
weight_single1 = 0;
if (weight_single_wait == -1)
@@ -517,6 +537,10 @@ static int __init scf_torture_init(void)
firsterr = -EINVAL;
goto unwind;
}
+ if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST))
+ scf_sel_add(weight_resched1, SCF_PRIM_RESCHED, false);
+ else if (weight_resched1)
+ VERBOSE_SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored");
scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false);
scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true);
scf_sel_add(weight_many1, SCF_PRIM_MANY, false);
@@ -535,6 +559,11 @@ static int __init scf_torture_init(void)
if (firsterr)
goto unwind;
}
+ if (stutter > 0) {
+ firsterr = torture_stutter_init(stutter, stutter);
+ if (firsterr)
+ goto unwind;
+ }
// Worker tasks invoking smp_call_function().
if (nthreads < 0)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7af80c3fce12..3636b80222ac 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4819,6 +4819,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
preempt_count_set(PREEMPT_DISABLED);
}
rcu_sleep_check();
+ SCHED_WARN_ON(ct_state() == CONTEXT_USER);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -5160,7 +5161,7 @@ void __sched schedule_idle(void)
} while (need_resched());
}
-#ifdef CONFIG_CONTEXT_TRACKING
+#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK)
asmlinkage __visible void __sched schedule_user(void)
{
/*
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index df91b198a74c..305727ea0677 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -78,7 +78,7 @@ void __weak arch_cpu_idle_dead(void) { }
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
- local_irq_enable();
+ raw_local_irq_enable();
}
/**
@@ -94,9 +94,35 @@ void __cpuidle default_idle_call(void)
trace_cpu_idle(1, smp_processor_id());
stop_critical_timings();
+
+ /*
+ * arch_cpu_idle() is supposed to enable IRQs, however
+ * we can't do that because of RCU and tracing.
+ *
+ * Trace IRQs enable here, then switch off RCU, and have
+ * arch_cpu_idle() use raw_local_irq_enable(). Note that
+ * rcu_idle_enter() relies on lockdep IRQ state, so switch that
+ * last -- this is very similar to the entry code.
+ */
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(_THIS_IP_);
rcu_idle_enter();
+ lockdep_hardirqs_on(_THIS_IP_);
+
arch_cpu_idle();
+
+ /*
+ * OK, so IRQs are enabled here, but RCU needs them disabled to
+ * turn itself back on.. funny thing is that disabling IRQs
+ * will cause tracing, which needs RCU. Jump through hoops to
+ * make it 'work'.
+ */
+ raw_local_irq_disable();
+ lockdep_hardirqs_off(_THIS_IP_);
rcu_idle_exit();
+ lockdep_hardirqs_on(_THIS_IP_);
+ raw_local_irq_enable();
+
start_critical_timings();
trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 5a40b3828ff2..08ae45ad9261 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -166,8 +166,33 @@ static void ipi_mb(void *info)
smp_mb(); /* IPIs should be serializing but paranoid. */
}
+static void ipi_sync_core(void *info)
+{
+ /*
+ * The smp_mb() in membarrier after all the IPIs is supposed to
+ * ensure that memory on remote CPUs that occur before the IPI
+ * become visible to membarrier()'s caller -- see scenario B in
+ * the big comment at the top of this file.
+ *
+ * A sync_core() would provide this guarantee, but
+ * sync_core_before_usermode() might end up being deferred until
+ * after membarrier()'s smp_mb().
+ */
+ smp_mb(); /* IPIs should be serializing but paranoid. */
+
+ sync_core_before_usermode();
+}
+
static void ipi_rseq(void *info)
{
+ /*
+ * Ensure that all stores done by the calling thread are visible
+ * to the current task before the current task resumes. We could
+ * probably optimize this away on most architectures, but by the
+ * time we've already sent an IPI, the cost of the extra smp_mb()
+ * is negligible.
+ */
+ smp_mb();
rseq_preempt(current);
}
@@ -293,6 +318,7 @@ static int membarrier_private_expedited(int flags, int cpu_id)
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
+ ipi_func = ipi_sync_core;
} else if (flags == MEMBARRIER_FLAG_RSEQ) {
if (!IS_ENABLED(CONFIG_RSEQ))
return -EINVAL;
@@ -307,7 +333,8 @@ static int membarrier_private_expedited(int flags, int cpu_id)
return -EPERM;
}
- if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
+ if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
+ (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
return 0;
/*
@@ -326,8 +353,6 @@ static int membarrier_private_expedited(int flags, int cpu_id)
if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
goto out;
- if (cpu_id == raw_smp_processor_id())
- goto out;
rcu_read_lock();
p = rcu_dereference(cpu_rq(cpu_id)->curr);
if (!p || p->mm != mm) {
@@ -342,16 +367,6 @@ static int membarrier_private_expedited(int flags, int cpu_id)
for_each_online_cpu(cpu) {
struct task_struct *p;
- /*
- * Skipping the current CPU is OK even through we can be
- * migrated at any point. The current CPU, at the point
- * where we read raw_smp_processor_id(), is ensured to
- * be in program order with respect to the caller
- * thread. Therefore, we can skip this CPU from the
- * iteration.
- */
- if (cpu == raw_smp_processor_id())
- continue;
p = rcu_dereference(cpu_rq(cpu)->curr);
if (p && p->mm == mm)
__cpumask_set_cpu(cpu, tmpmask);
@@ -359,12 +374,38 @@ static int membarrier_private_expedited(int flags, int cpu_id)
rcu_read_unlock();
}
- preempt_disable();
- if (cpu_id >= 0)
+ if (cpu_id >= 0) {
+ /*
+ * smp_call_function_single() will call ipi_func() if cpu_id
+ * is the calling CPU.
+ */
smp_call_function_single(cpu_id, ipi_func, NULL, 1);
- else
- smp_call_function_many(tmpmask, ipi_func, NULL, 1);
- preempt_enable();
+ } else {
+ /*
+ * For regular membarrier, we can save a few cycles by
+ * skipping the current cpu -- we're about to do smp_mb()
+ * below, and if we migrate to a different cpu, this cpu
+ * and the new cpu will execute a full barrier in the
+ * scheduler.
+ *
+ * For SYNC_CORE, we do need a barrier on the current cpu --
+ * otherwise, if we are migrated and replaced by a different
+ * task in the same mm just before, during, or after
+ * membarrier, we will end up with some thread in the mm
+ * running without a core sync.
+ *
+ * For RSEQ, don't rseq_preempt() the caller. User code
+ * is not supposed to issue syscalls at all from inside an
+ * rseq critical section.
+ */
+ if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
+ preempt_disable();
+ smp_call_function_many(tmpmask, ipi_func, NULL, true);
+ preempt_enable();
+ } else {
+ on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
+ }
+ }
out:
if (cpu_id < 0)
diff --git a/kernel/scs.c b/kernel/scs.c
index 4ff4a7ba0094..e2a71fc82fa0 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -5,26 +5,49 @@
* Copyright (C) 2019 Google LLC
*/
+#include <linux/cpuhotplug.h>
#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/scs.h>
-#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <linux/vmstat.h>
-static struct kmem_cache *scs_cache;
-
static void __scs_account(void *s, int account)
{
- struct page *scs_page = virt_to_page(s);
+ struct page *scs_page = vmalloc_to_page(s);
mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB,
account * (SCS_SIZE / SZ_1K));
}
-static void *scs_alloc(int node)
+/* Matches NR_CACHED_STACKS for VMAP_STACK */
+#define NR_CACHED_SCS 2
+static DEFINE_PER_CPU(void *, scs_cache[NR_CACHED_SCS]);
+
+static void *__scs_alloc(int node)
{
- void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node);
+ int i;
+ void *s;
+
+ for (i = 0; i < NR_CACHED_SCS; i++) {
+ s = this_cpu_xchg(scs_cache[i], NULL);
+ if (s) {
+ kasan_unpoison_vmalloc(s, SCS_SIZE);
+ memset(s, 0, SCS_SIZE);
+ return s;
+ }
+ }
+
+ return __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
+ GFP_SCS, PAGE_KERNEL, 0, node,
+ __builtin_return_address(0));
+}
+void *scs_alloc(int node)
+{
+ void *s;
+
+ s = __scs_alloc(node);
if (!s)
return NULL;
@@ -34,21 +57,47 @@ static void *scs_alloc(int node)
* Poison the allocation to catch unintentional accesses to
* the shadow stack when KASAN is enabled.
*/
- kasan_poison_object_data(scs_cache, s);
+ kasan_poison_vmalloc(s, SCS_SIZE);
__scs_account(s, 1);
return s;
}
-static void scs_free(void *s)
+void scs_free(void *s)
{
+ int i;
+
__scs_account(s, -1);
- kasan_unpoison_object_data(scs_cache, s);
- kmem_cache_free(scs_cache, s);
+
+ /*
+ * We cannot sleep as this can be called in interrupt context,
+ * so use this_cpu_cmpxchg to update the cache, and vfree_atomic
+ * to free the stack.
+ */
+
+ for (i = 0; i < NR_CACHED_SCS; i++)
+ if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL)
+ return;
+
+ vfree_atomic(s);
+}
+
+static int scs_cleanup(unsigned int cpu)
+{
+ int i;
+ void **cache = per_cpu_ptr(scs_cache, cpu);
+
+ for (i = 0; i < NR_CACHED_SCS; i++) {
+ vfree(cache[i]);
+ cache[i] = NULL;
+ }
+
+ return 0;
}
void __init scs_init(void)
{
- scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL);
+ cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "scs:scs_cache", NULL,
+ scs_cleanup);
}
int scs_prepare(struct task_struct *tsk, int node)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 53a7d1512dd7..15f47fc11d13 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -356,14 +356,14 @@ static inline void seccomp_assign_mode(struct task_struct *task,
task->seccomp.mode = seccomp_mode;
/*
- * Make sure TIF_SECCOMP cannot be set before the mode (and
+ * Make sure SYSCALL_WORK_SECCOMP cannot be set before the mode (and
* filter) is set.
*/
smp_mb__before_atomic();
/* Assume default seccomp processes want spec flaw mitigation. */
if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
arch_seccomp_spec_mitigate(task);
- set_tsk_thread_flag(task, TIF_SECCOMP);
+ set_task_syscall_work(task, SECCOMP);
}
#ifdef CONFIG_SECCOMP_FILTER
@@ -928,7 +928,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
/*
* Make sure that any changes to mode from another thread have
- * been seen after TIF_SECCOMP was seen.
+ * been seen after SYSCALL_WORK_SECCOMP was seen.
*/
rmb();
diff --git a/kernel/signal.c b/kernel/signal.c
index ef8f2a28d37c..c37170655171 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -984,7 +984,7 @@ static inline bool wants_signal(int sig, struct task_struct *p)
if (task_is_stopped_or_traced(p))
return false;
- return task_curr(p) || !signal_pending(p);
+ return task_curr(p) || !task_sigpending(p);
}
static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
@@ -2524,12 +2524,46 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info)
return signr;
}
+static void hide_si_addr_tag_bits(struct ksignal *ksig)
+{
+ switch (siginfo_layout(ksig->sig, ksig->info.si_code)) {
+ case SIL_FAULT:
+ case SIL_FAULT_MCEERR:
+ case SIL_FAULT_BNDERR:
+ case SIL_FAULT_PKUERR:
+ ksig->info.si_addr = arch_untagged_si_addr(
+ ksig->info.si_addr, ksig->sig, ksig->info.si_code);
+ break;
+ case SIL_KILL:
+ case SIL_TIMER:
+ case SIL_POLL:
+ case SIL_CHLD:
+ case SIL_RT:
+ case SIL_SYS:
+ break;
+ }
+}
+
bool get_signal(struct ksignal *ksig)
{
struct sighand_struct *sighand = current->sighand;
struct signal_struct *signal = current->signal;
int signr;
+ /*
+ * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so
+ * that the arch handlers don't all have to do it. If we get here
+ * without TIF_SIGPENDING, just exit after running signal work.
+ */
+#ifdef TIF_NOTIFY_SIGNAL
+ if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
+ if (test_thread_flag(TIF_NOTIFY_SIGNAL))
+ tracehook_notify_signal();
+ if (!task_sigpending(current))
+ return false;
+ }
+#endif
+
if (unlikely(uprobe_deny_signal()))
return false;
@@ -2761,6 +2795,10 @@ relock:
spin_unlock_irq(&sighand->siglock);
ksig->sig = signr;
+
+ if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS))
+ hide_si_addr_tag_bits(ksig);
+
return ksig->sig > 0;
}
@@ -2823,7 +2861,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
/* Remove the signals this thread can handle. */
sigandsets(&retarget, &retarget, &t->blocked);
- if (!signal_pending(t))
+ if (!task_sigpending(t))
signal_wake_up(t, 0);
if (sigisemptyset(&retarget))
@@ -2857,7 +2895,7 @@ void exit_signals(struct task_struct *tsk)
cgroup_threadgroup_change_end(tsk);
- if (!signal_pending(tsk))
+ if (!task_sigpending(tsk))
goto out;
unblocked = tsk->blocked;
@@ -2901,7 +2939,7 @@ long do_no_restart_syscall(struct restart_block *param)
static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
{
- if (signal_pending(tsk) && !thread_group_empty(tsk)) {
+ if (task_sigpending(tsk) && !thread_group_empty(tsk)) {
sigset_t newblocked;
/* A set of now blocked but previously unblocked signals. */
sigandnsets(&newblocked, newset, &current->blocked);
@@ -3985,6 +4023,22 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
if (oact)
*oact = *k;
+ /*
+ * Make sure that we never accidentally claim to support SA_UNSUPPORTED,
+ * e.g. by having an architecture use the bit in their uapi.
+ */
+ BUILD_BUG_ON(UAPI_SA_FLAGS & SA_UNSUPPORTED);
+
+ /*
+ * Clear unknown flag bits in order to allow userspace to detect missing
+ * support for flag bits and to allow the kernel to use non-uapi bits
+ * internally.
+ */
+ if (act)
+ act->sa.sa_flags &= UAPI_SA_FLAGS;
+ if (oact)
+ oact->sa.sa_flags &= UAPI_SA_FLAGS;
+
sigaction_compat_abi(act, oact);
if (act) {
diff --git a/kernel/sys.c b/kernel/sys.c
index a730c03ee607..51f00fe20e4d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -42,6 +42,7 @@
#include <linux/syscore_ops.h>
#include <linux/version.h>
#include <linux/ctype.h>
+#include <linux/syscall_user_dispatch.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
@@ -2530,6 +2531,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
break;
+ case PR_SET_SYSCALL_USER_DISPATCH:
+ error = set_syscall_user_dispatch(arg2, arg3, arg4,
+ (char __user *) arg5);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index afad085960b8..c9fbdd848138 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2650,6 +2650,17 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif
+#if defined(CONFIG_TREE_RCU)
+ {
+ .procname = "max_rcu_stall_to_panic",
+ .data = &sysctl_max_rcu_stall_to_panic,
+ .maxlen = sizeof(sysctl_max_rcu_stall_to_panic),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_INT_MAX,
+ },
+#endif
#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
{
.procname = "stack_erasing",
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 8d6e1217c451..15b087286bea 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -5,6 +5,34 @@
static struct callback_head work_exited; /* all we need is ->next == NULL */
+/*
+ * TWA_SIGNAL signaling - use TIF_NOTIFY_SIGNAL, if available, as it's faster
+ * than TIF_SIGPENDING as there's no dependency on ->sighand. The latter is
+ * shared for threads, and can cause contention on sighand->lock. Even for
+ * the non-threaded case TIF_NOTIFY_SIGNAL is more efficient, as no locking
+ * or IRQ disabling is involved for notification (or running) purposes.
+ */
+static void task_work_notify_signal(struct task_struct *task)
+{
+#if defined(TIF_NOTIFY_SIGNAL)
+ set_notify_signal(task);
+#else
+ unsigned long flags;
+
+ /*
+ * Only grab the sighand lock if we don't already have some
+ * task_work pending. This pairs with the smp_store_mb()
+ * in get_signal(), see comment there.
+ */
+ if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) &&
+ lock_task_sighand(task, &flags)) {
+ task->jobctl |= JOBCTL_TASK_WORK;
+ signal_wake_up(task, 0);
+ unlock_task_sighand(task, &flags);
+ }
+#endif
+}
+
/**
* task_work_add - ask the @task to execute @work->func()
* @task: the task which should run the callback
@@ -33,7 +61,6 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
enum task_work_notify_mode notify)
{
struct callback_head *head;
- unsigned long flags;
do {
head = READ_ONCE(task->task_works);
@@ -49,17 +76,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
set_notify_resume(task);
break;
case TWA_SIGNAL:
- /*
- * Only grab the sighand lock if we don't already have some
- * task_work pending. This pairs with the smp_store_mb()
- * in get_signal(), see comment there.
- */
- if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) &&
- lock_task_sighand(task, &flags)) {
- task->jobctl |= JOBCTL_TASK_WORK;
- signal_wake_up(task, 0);
- unlock_task_sighand(task, &flags);
- }
+ task_work_notify_signal(task);
break;
default:
WARN_ON_ONCE(1);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 387b4bef7dd1..743c852e10f2 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1284,7 +1284,7 @@ int hrtimer_cancel(struct hrtimer *timer)
EXPORT_SYMBOL_GPL(hrtimer_cancel);
/**
- * hrtimer_get_remaining - get remaining time for the timer
+ * __hrtimer_get_remaining - get remaining time for the timer
* @timer: the timer to read
* @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y
*/
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index eddcf4970444..a5cffe2a1770 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -59,7 +59,8 @@ static struct clocksource clocksource_jiffies = {
};
__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
-__cacheline_aligned_in_smp seqcount_t jiffies_seq;
+__cacheline_aligned_in_smp seqcount_raw_spinlock_t jiffies_seq =
+ SEQCNT_RAW_SPINLOCK_ZERO(jiffies_seq, &jiffies_lock);
#if (BITS_PER_LONG < 64)
u64 get_jiffies_64(void)
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index afc65e6be33e..6ca625f5e554 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -92,7 +92,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
if (!ns)
goto fail_dec;
- kref_init(&ns->kref);
+ refcount_set(&ns->ns.count, 1);
ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!ns->vvar_page)
@@ -226,11 +226,8 @@ out:
mutex_unlock(&offset_lock);
}
-void free_time_ns(struct kref *kref)
+void free_time_ns(struct time_namespace *ns)
{
- struct time_namespace *ns;
-
- ns = container_of(kref, struct time_namespace, kref);
dec_time_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
@@ -308,22 +305,20 @@ static int timens_install(struct nsset *nsset, struct ns_common *new)
return 0;
}
-int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
+void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
{
struct ns_common *nsc = &nsproxy->time_ns_for_children->ns;
struct time_namespace *ns = to_time_ns(nsc);
/* create_new_namespaces() already incremented the ref counter */
if (nsproxy->time_ns == nsproxy->time_ns_for_children)
- return 0;
+ return;
get_time_ns(ns);
put_time_ns(nsproxy->time_ns);
nsproxy->time_ns = ns;
timens_commit(tsk, ns);
-
- return 0;
}
static struct user_namespace *timens_owner(struct ns_common *ns)
@@ -464,7 +459,7 @@ const struct proc_ns_operations timens_for_children_operations = {
};
struct time_namespace init_time_ns = {
- .kref = KREF_INIT(3),
+ .ns.count = REFCOUNT_INIT(3),
.user_ns = &init_user_ns,
.ns.inum = PROC_TIME_INIT_INO,
.ns.ops = &timens_operations,
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 069ca78fb0bf..7404d3831527 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -494,65 +494,74 @@ out:
return leap;
}
+#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
static void sync_hw_clock(struct work_struct *work);
-static DECLARE_DELAYED_WORK(sync_work, sync_hw_clock);
-
-static void sched_sync_hw_clock(struct timespec64 now,
- unsigned long target_nsec, bool fail)
+static DECLARE_WORK(sync_work, sync_hw_clock);
+static struct hrtimer sync_hrtimer;
+#define SYNC_PERIOD_NS (11UL * 60 * NSEC_PER_SEC)
+static enum hrtimer_restart sync_timer_callback(struct hrtimer *timer)
{
- struct timespec64 next;
-
- ktime_get_real_ts64(&next);
- if (!fail)
- next.tv_sec = 659;
- else {
- /*
- * Try again as soon as possible. Delaying long periods
- * decreases the accuracy of the work queue timer. Due to this
- * the algorithm is very likely to require a short-sleep retry
- * after the above long sleep to synchronize ts_nsec.
- */
- next.tv_sec = 0;
- }
-
- /* Compute the needed delay that will get to tv_nsec == target_nsec */
- next.tv_nsec = target_nsec - next.tv_nsec;
- if (next.tv_nsec <= 0)
- next.tv_nsec += NSEC_PER_SEC;
- if (next.tv_nsec >= NSEC_PER_SEC) {
- next.tv_sec++;
- next.tv_nsec -= NSEC_PER_SEC;
- }
+ queue_work(system_power_efficient_wq, &sync_work);
- queue_delayed_work(system_power_efficient_wq, &sync_work,
- timespec64_to_jiffies(&next));
+ return HRTIMER_NORESTART;
}
-static void sync_rtc_clock(void)
+static void sched_sync_hw_clock(unsigned long offset_nsec, bool retry)
{
- unsigned long target_nsec;
- struct timespec64 adjust, now;
- int rc;
+ ktime_t exp = ktime_set(ktime_get_real_seconds(), 0);
- if (!IS_ENABLED(CONFIG_RTC_SYSTOHC))
- return;
+ if (retry)
+ exp = ktime_add_ns(exp, 2 * NSEC_PER_SEC - offset_nsec);
+ else
+ exp = ktime_add_ns(exp, SYNC_PERIOD_NS - offset_nsec);
- ktime_get_real_ts64(&now);
+ hrtimer_start(&sync_hrtimer, exp, HRTIMER_MODE_ABS);
+}
- adjust = now;
- if (persistent_clock_is_local)
- adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
+/*
+ * Check whether @now is correct versus the required time to update the RTC
+ * and calculate the value which needs to be written to the RTC so that the
+ * next seconds increment of the RTC after the write is aligned with the next
+ * seconds increment of clock REALTIME.
+ *
+ * tsched t1 write(t2.tv_sec - 1sec)) t2 RTC increments seconds
+ *
+ * t2.tv_nsec == 0
+ * tsched = t2 - set_offset_nsec
+ * newval = t2 - NSEC_PER_SEC
+ *
+ * ==> neval = tsched + set_offset_nsec - NSEC_PER_SEC
+ *
+ * As the execution of this code is not guaranteed to happen exactly at
+ * tsched this allows it to happen within a fuzzy region:
+ *
+ * abs(now - tsched) < FUZZ
+ *
+ * If @now is not inside the allowed window the function returns false.
+ */
+static inline bool rtc_tv_nsec_ok(unsigned long set_offset_nsec,
+ struct timespec64 *to_set,
+ const struct timespec64 *now)
+{
+ /* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. */
+ const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5;
+ struct timespec64 delay = {.tv_sec = -1,
+ .tv_nsec = set_offset_nsec};
- /*
- * The current RTC in use will provide the target_nsec it wants to be
- * called at, and does rtc_tv_nsec_ok internally.
- */
- rc = rtc_set_ntp_time(adjust, &target_nsec);
- if (rc == -ENODEV)
- return;
+ *to_set = timespec64_add(*now, delay);
+
+ if (to_set->tv_nsec < TIME_SET_NSEC_FUZZ) {
+ to_set->tv_nsec = 0;
+ return true;
+ }
- sched_sync_hw_clock(now, target_nsec, rc);
+ if (to_set->tv_nsec > NSEC_PER_SEC - TIME_SET_NSEC_FUZZ) {
+ to_set->tv_sec++;
+ to_set->tv_nsec = 0;
+ return true;
+ }
+ return false;
}
#ifdef CONFIG_GENERIC_CMOS_UPDATE
@@ -560,48 +569,47 @@ int __weak update_persistent_clock64(struct timespec64 now64)
{
return -ENODEV;
}
+#else
+static inline int update_persistent_clock64(struct timespec64 now64)
+{
+ return -ENODEV;
+}
#endif
-static bool sync_cmos_clock(void)
+#ifdef CONFIG_RTC_SYSTOHC
+/* Save NTP synchronized time to the RTC */
+static int update_rtc(struct timespec64 *to_set, unsigned long *offset_nsec)
{
- static bool no_cmos;
- struct timespec64 now;
- struct timespec64 adjust;
- int rc = -EPROTO;
- long target_nsec = NSEC_PER_SEC / 2;
+ struct rtc_device *rtc;
+ struct rtc_time tm;
+ int err = -ENODEV;
- if (!IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE))
- return false;
+ rtc = rtc_class_open(CONFIG_RTC_SYSTOHC_DEVICE);
+ if (!rtc)
+ return -ENODEV;
- if (no_cmos)
- return false;
+ if (!rtc->ops || !rtc->ops->set_time)
+ goto out_close;
- /*
- * Historically update_persistent_clock64() has followed x86
- * semantics, which match the MC146818A/etc RTC. This RTC will store
- * 'adjust' and then in .5s it will advance once second.
- *
- * Architectures are strongly encouraged to use rtclib and not
- * implement this legacy API.
- */
- ktime_get_real_ts64(&now);
- if (rtc_tv_nsec_ok(-1 * target_nsec, &adjust, &now)) {
- if (persistent_clock_is_local)
- adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
- rc = update_persistent_clock64(adjust);
- /*
- * The machine does not support update_persistent_clock64 even
- * though it defines CONFIG_GENERIC_CMOS_UPDATE.
- */
- if (rc == -ENODEV) {
- no_cmos = true;
- return false;
- }
+ /* First call might not have the correct offset */
+ if (*offset_nsec == rtc->set_offset_nsec) {
+ rtc_time64_to_tm(to_set->tv_sec, &tm);
+ err = rtc_set_time(rtc, &tm);
+ } else {
+ /* Store the update offset and let the caller try again */
+ *offset_nsec = rtc->set_offset_nsec;
+ err = -EAGAIN;
}
-
- sched_sync_hw_clock(now, target_nsec, rc);
- return true;
+out_close:
+ rtc_class_close(rtc);
+ return err;
+}
+#else
+static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_nsec)
+{
+ return -ENODEV;
}
+#endif
/*
* If we have an externally synchronized Linux clock, then update RTC clock
@@ -613,24 +621,64 @@ static bool sync_cmos_clock(void)
*/
static void sync_hw_clock(struct work_struct *work)
{
- if (!ntp_synced())
- return;
+ /*
+ * The default synchronization offset is 500ms for the deprecated
+ * update_persistent_clock64() under the assumption that it uses
+ * the infamous CMOS clock (MC146818).
+ */
+ static unsigned long offset_nsec = NSEC_PER_SEC / 2;
+ struct timespec64 now, to_set;
+ int res = -EAGAIN;
- if (sync_cmos_clock())
+ /*
+ * Don't update if STA_UNSYNC is set and if ntp_notify_cmos_timer()
+ * managed to schedule the work between the timer firing and the
+ * work being able to rearm the timer. Wait for the timer to expire.
+ */
+ if (!ntp_synced() || hrtimer_is_queued(&sync_hrtimer))
return;
- sync_rtc_clock();
+ ktime_get_real_ts64(&now);
+ /* If @now is not in the allowed window, try again */
+ if (!rtc_tv_nsec_ok(offset_nsec, &to_set, &now))
+ goto rearm;
+
+ /* Take timezone adjusted RTCs into account */
+ if (persistent_clock_is_local)
+ to_set.tv_sec -= (sys_tz.tz_minuteswest * 60);
+
+ /* Try the legacy RTC first. */
+ res = update_persistent_clock64(to_set);
+ if (res != -ENODEV)
+ goto rearm;
+
+ /* Try the RTC class */
+ res = update_rtc(&to_set, &offset_nsec);
+ if (res == -ENODEV)
+ return;
+rearm:
+ sched_sync_hw_clock(offset_nsec, res != 0);
}
void ntp_notify_cmos_timer(void)
{
- if (!ntp_synced())
- return;
+ /*
+ * When the work is currently executed but has not yet the timer
+ * rearmed this queues the work immediately again. No big issue,
+ * just a pointless work scheduled.
+ */
+ if (ntp_synced() && !hrtimer_is_queued(&sync_hrtimer))
+ queue_work(system_power_efficient_wq, &sync_work);
+}
- if (IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE) ||
- IS_ENABLED(CONFIG_RTC_SYSTOHC))
- queue_delayed_work(system_power_efficient_wq, &sync_work, 0);
+static void __init ntp_init_cmos_sync(void)
+{
+ hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+ sync_hrtimer.function = sync_timer_callback;
}
+#else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */
+static inline void __init ntp_init_cmos_sync(void) { }
+#endif /* !CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */
/*
* Propagate a new txc->status value into the NTP state:
@@ -1044,4 +1092,5 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup);
void __init ntp_init(void)
{
ntp_clear();
+ ntp_init_cmos_sync();
}
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 908ecaa65fc3..23d1b74c3065 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -12,4 +12,11 @@ extern int __do_adjtimex(struct __kernel_timex *txc,
const struct timespec64 *ts,
s32 *time_tai, struct audit_ntp_data *ad);
extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts);
+
+#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
+extern void ntp_notify_cmos_timer(void);
+#else
+static inline void ntp_notify_cmos_timer(void) { }
+#endif
+
#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 36d7464c8962..5a23829372c7 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -331,7 +331,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
bc_local = tick_do_periodic_broadcast();
if (clockevent_state_oneshot(dev)) {
- ktime_t next = ktime_add(dev->next_event, tick_period);
+ ktime_t next = ktime_add_ns(dev->next_event, TICK_NSEC);
clockevents_program_event(dev, next, true);
}
@@ -877,6 +877,22 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,
}
}
+static inline ktime_t tick_get_next_period(void)
+{
+ ktime_t next;
+
+ /*
+ * Protect against concurrent updates (store /load tearing on
+ * 32bit). It does not matter if the time is already in the
+ * past. The broadcast device which is about to be programmed will
+ * fire in any case.
+ */
+ raw_spin_lock(&jiffies_lock);
+ next = tick_next_period;
+ raw_spin_unlock(&jiffies_lock);
+ return next;
+}
+
/**
* tick_broadcast_setup_oneshot - setup the broadcast device
*/
@@ -905,10 +921,11 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
tick_broadcast_oneshot_mask, tmpmask);
if (was_periodic && !cpumask_empty(tmpmask)) {
+ ktime_t nextevt = tick_get_next_period();
+
clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
- tick_broadcast_init_next_event(tmpmask,
- tick_next_period);
- tick_broadcast_set_event(bc, cpu, tick_next_period);
+ tick_broadcast_init_next_event(tmpmask, nextevt);
+ tick_broadcast_set_event(bc, cpu, nextevt);
} else
bc->next_event = KTIME_MAX;
} else {
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 6c9c342dd0e5..a03764df5366 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -27,10 +27,11 @@
*/
DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
/*
- * Tick next event: keeps track of the tick time
+ * Tick next event: keeps track of the tick time. It's updated by the
+ * CPU which handles the tick and protected by jiffies_lock. There is
+ * no requirement to write hold the jiffies seqcount for it.
*/
ktime_t tick_next_period;
-ktime_t tick_period;
/*
* tick_do_timer_cpu is a timer core internal variable which holds the CPU NR
@@ -88,7 +89,7 @@ static void tick_periodic(int cpu)
write_seqcount_begin(&jiffies_seq);
/* Keep track of the next tick event */
- tick_next_period = ktime_add(tick_next_period, tick_period);
+ tick_next_period = ktime_add_ns(tick_next_period, TICK_NSEC);
do_timer(1);
write_seqcount_end(&jiffies_seq);
@@ -127,7 +128,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
* Setup the next period for devices, which do not have
* periodic mode:
*/
- next = ktime_add(next, tick_period);
+ next = ktime_add_ns(next, TICK_NSEC);
if (!clockevents_program_event(dev, next, false))
return;
@@ -173,7 +174,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
for (;;) {
if (!clockevents_program_event(dev, next, false))
return;
- next = ktime_add(next, tick_period);
+ next = ktime_add_ns(next, TICK_NSEC);
}
}
}
@@ -220,7 +221,6 @@ static void tick_setup_device(struct tick_device *td,
tick_do_timer_cpu = cpu;
tick_next_period = ktime_get();
- tick_period = NSEC_PER_SEC / HZ;
#ifdef CONFIG_NO_HZ_FULL
/*
* The boot CPU may be nohz_full, in which case set
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 7b2496136729..7a981c9e87a4 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -15,7 +15,6 @@
DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
extern ktime_t tick_next_period;
-extern ktime_t tick_period;
extern int tick_do_timer_cpu __read_mostly;
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1b734070f028..030282994b3e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
#include <linux/sched/clock.h>
#include <linux/sched/stat.h>
#include <linux/sched/nohz.h>
+#include <linux/sched/loadavg.h>
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
@@ -44,7 +45,9 @@ struct tick_sched *tick_get_tick_sched(int cpu)
#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
/*
- * The time, when the last jiffy update happened. Protected by jiffies_lock.
+ * The time, when the last jiffy update happened. Write access must hold
+ * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
+ * consistent view of jiffies and last_jiffies_update.
*/
static ktime_t last_jiffies_update;
@@ -53,50 +56,97 @@ static ktime_t last_jiffies_update;
*/
static void tick_do_update_jiffies64(ktime_t now)
{
- unsigned long ticks = 0;
- ktime_t delta;
+ unsigned long ticks = 1;
+ ktime_t delta, nextp;
/*
- * Do a quick check without holding jiffies_lock:
- * The READ_ONCE() pairs with two updates done later in this function.
+ * 64bit can do a quick check without holding jiffies lock and
+ * without looking at the sequence count. The smp_load_acquire()
+ * pairs with the update done later in this function.
+ *
+ * 32bit cannot do that because the store of tick_next_period
+ * consists of two 32bit stores and the first store could move it
+ * to a random point in the future.
*/
- delta = ktime_sub(now, READ_ONCE(last_jiffies_update));
- if (delta < tick_period)
- return;
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ if (ktime_before(now, smp_load_acquire(&tick_next_period)))
+ return;
+ } else {
+ unsigned int seq;
- /* Reevaluate with jiffies_lock held */
+ /*
+ * Avoid contention on jiffies_lock and protect the quick
+ * check with the sequence count.
+ */
+ do {
+ seq = read_seqcount_begin(&jiffies_seq);
+ nextp = tick_next_period;
+ } while (read_seqcount_retry(&jiffies_seq, seq));
+
+ if (ktime_before(now, nextp))
+ return;
+ }
+
+ /* Quick check failed, i.e. update is required. */
raw_spin_lock(&jiffies_lock);
+ /*
+ * Reevaluate with the lock held. Another CPU might have done the
+ * update already.
+ */
+ if (ktime_before(now, tick_next_period)) {
+ raw_spin_unlock(&jiffies_lock);
+ return;
+ }
+
write_seqcount_begin(&jiffies_seq);
- delta = ktime_sub(now, last_jiffies_update);
- if (delta >= tick_period) {
+ delta = ktime_sub(now, tick_next_period);
+ if (unlikely(delta >= TICK_NSEC)) {
+ /* Slow path for long idle sleep times */
+ s64 incr = TICK_NSEC;
- delta = ktime_sub(delta, tick_period);
- /* Pairs with the lockless read in this function. */
- WRITE_ONCE(last_jiffies_update,
- ktime_add(last_jiffies_update, tick_period));
+ ticks += ktime_divns(delta, incr);
- /* Slow path for long timeouts */
- if (unlikely(delta >= tick_period)) {
- s64 incr = ktime_to_ns(tick_period);
+ last_jiffies_update = ktime_add_ns(last_jiffies_update,
+ incr * ticks);
+ } else {
+ last_jiffies_update = ktime_add_ns(last_jiffies_update,
+ TICK_NSEC);
+ }
- ticks = ktime_divns(delta, incr);
+ /* Advance jiffies to complete the jiffies_seq protected job */
+ jiffies_64 += ticks;
- /* Pairs with the lockless read in this function. */
- WRITE_ONCE(last_jiffies_update,
- ktime_add_ns(last_jiffies_update,
- incr * ticks));
- }
- do_timer(++ticks);
+ /*
+ * Keep the tick_next_period variable up to date.
+ */
+ nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);
- /* Keep the tick_next_period variable up to date */
- tick_next_period = ktime_add(last_jiffies_update, tick_period);
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ /*
+ * Pairs with smp_load_acquire() in the lockless quick
+ * check above and ensures that the update to jiffies_64 is
+ * not reordered vs. the store to tick_next_period, neither
+ * by the compiler nor by the CPU.
+ */
+ smp_store_release(&tick_next_period, nextp);
} else {
- write_seqcount_end(&jiffies_seq);
- raw_spin_unlock(&jiffies_lock);
- return;
+ /*
+ * A plain store is good enough on 32bit as the quick check
+ * above is protected by the sequence count.
+ */
+ tick_next_period = nextp;
}
+
+ /*
+ * Release the sequence count. calc_global_load() below is not
+ * protected by it, but jiffies_lock needs to be held to prevent
+ * concurrent invocations.
+ */
write_seqcount_end(&jiffies_seq);
+
+ calc_global_load();
+
raw_spin_unlock(&jiffies_lock);
update_wall_time();
}
@@ -659,7 +709,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
/* Forward the time to expire in the future */
- hrtimer_forward(&ts->sched_timer, now, tick_period);
+ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
hrtimer_start_expires(&ts->sched_timer,
@@ -1228,7 +1278,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
if (unlikely(ts->tick_stopped))
return;
- hrtimer_forward(&ts->sched_timer, now, tick_period);
+ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}
@@ -1265,7 +1315,7 @@ static void tick_nohz_switch_to_nohz(void)
next = tick_init_jiffy_update();
hrtimer_set_expires(&ts->sched_timer, next);
- hrtimer_forward_now(&ts->sched_timer, tick_period);
+ hrtimer_forward_now(&ts->sched_timer, TICK_NSEC);
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
}
@@ -1331,7 +1381,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
if (unlikely(ts->tick_stopped))
return HRTIMER_NORESTART;
- hrtimer_forward(timer, now, tick_period);
+ hrtimer_forward(timer, now, TICK_NSEC);
return HRTIMER_RESTART;
}
@@ -1365,13 +1415,13 @@ void tick_setup_sched_timer(void)
/* Offset the tick to avert jiffies_lock contention. */
if (sched_skew_tick) {
- u64 offset = ktime_to_ns(tick_period) >> 1;
+ u64 offset = TICK_NSEC >> 1;
do_div(offset, num_possible_cpus());
offset *= smp_processor_id();
hrtimer_add_expires_ns(&ts->sched_timer, offset);
}
- hrtimer_forward(&ts->sched_timer, now, tick_period);
+ hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
}
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
index 589e0a552129..62e3b46717a6 100644
--- a/kernel/time/timeconv.c
+++ b/kernel/time/timeconv.c
@@ -70,10 +70,10 @@ static const unsigned short __mon_yday[2][13] = {
/**
* time64_to_tm - converts the calendar time to local broken-down time
*
- * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970,
+ * @totalsecs: the number of seconds elapsed since 00:00:00 on January 1, 1970,
* Coordinated Universal Time (UTC).
- * @offset offset seconds adding to totalsecs.
- * @result pointer to struct tm variable to receive broken-down time
+ * @offset: offset seconds adding to totalsecs.
+ * @result: pointer to struct tm variable to receive broken-down time
*/
void time64_to_tm(time64_t totalsecs, int offset, struct tm *result)
{
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6858a31364b6..74503c0151e5 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -407,6 +407,7 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
* @tkr: Timekeeping readout base from which we take the update
+ * @tkf: Pointer to NMI safe timekeeper
*
* We want to use this from any context including NMI and tracing /
* instrumenting the timekeeping code itself.
@@ -436,6 +437,27 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr,
memcpy(base + 1, base, sizeof(*base));
}
+static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
+{
+ struct tk_read_base *tkr;
+ unsigned int seq;
+ u64 now;
+
+ do {
+ seq = raw_read_seqcount_latch(&tkf->seq);
+ tkr = tkf->base + (seq & 0x01);
+ now = ktime_to_ns(tkr->base);
+
+ now += timekeeping_delta_to_ns(tkr,
+ clocksource_delta(
+ tk_clock_read(tkr),
+ tkr->cycle_last,
+ tkr->mask));
+ } while (read_seqcount_latch_retry(&tkf->seq, seq));
+
+ return now;
+}
+
/**
* ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
*
@@ -462,39 +484,24 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr,
*
* So reader 6 will observe time going backwards versus reader 5.
*
- * While other CPUs are likely to be able observe that, the only way
+ * While other CPUs are likely to be able to observe that, the only way
* for a CPU local observation is when an NMI hits in the middle of
* the update. Timestamps taken from that NMI context might be ahead
* of the following timestamps. Callers need to be aware of that and
* deal with it.
*/
-static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
-{
- struct tk_read_base *tkr;
- unsigned int seq;
- u64 now;
-
- do {
- seq = raw_read_seqcount_latch(&tkf->seq);
- tkr = tkf->base + (seq & 0x01);
- now = ktime_to_ns(tkr->base);
-
- now += timekeeping_delta_to_ns(tkr,
- clocksource_delta(
- tk_clock_read(tkr),
- tkr->cycle_last,
- tkr->mask));
- } while (read_seqcount_latch_retry(&tkf->seq, seq));
-
- return now;
-}
-
u64 ktime_get_mono_fast_ns(void)
{
return __ktime_get_fast_ns(&tk_fast_mono);
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
+/**
+ * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw
+ *
+ * Contrary to ktime_get_mono_fast_ns() this is always correct because the
+ * conversion factor is not affected by NTP/PTP correction.
+ */
u64 ktime_get_raw_fast_ns(void)
{
return __ktime_get_fast_ns(&tk_fast_raw);
@@ -521,6 +528,9 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
* (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
* partially updated. Since the tk->offs_boot update is a rare event, this
* should be a rare occurrence which postprocessing should be able to handle.
+ *
+ * The caveats vs. timestamp ordering as documented for ktime_get_fast_ns()
+ * apply as well.
*/
u64 notrace ktime_get_boot_fast_ns(void)
{
@@ -530,9 +540,6 @@ u64 notrace ktime_get_boot_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
-/*
- * See comment for __ktime_get_fast_ns() vs. timestamp ordering
- */
static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
{
struct tk_read_base *tkr;
@@ -557,6 +564,8 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
/**
* ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime.
+ *
+ * See ktime_get_fast_ns() for documentation of the time stamp ordering.
*/
u64 ktime_get_real_fast_ns(void)
{
@@ -654,6 +663,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
/**
* pvclock_gtod_register_notifier - register a pvclock timedata update listener
+ * @nb: Pointer to the notifier block to register
*/
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
@@ -673,6 +683,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
/**
* pvclock_gtod_unregister_notifier - unregister a pvclock
* timedata update listener
+ * @nb: Pointer to the notifier block to unregister
*/
int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
{
@@ -763,6 +774,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
/**
* timekeeping_forward_now - update clock to the current time
+ * @tk: Pointer to the timekeeper to update
*
* Forward the current clock to update its state since the last call to
* update_wall_time(). This is useful before significant clock changes,
@@ -1339,7 +1351,7 @@ EXPORT_SYMBOL(do_settimeofday64);
/**
* timekeeping_inject_offset - Adds or subtracts from the current time.
- * @tv: pointer to the timespec variable containing the offset
+ * @ts: Pointer to the timespec variable containing the offset
*
* Adds or subtracts an offset value from the current time.
*/
@@ -1415,9 +1427,8 @@ void timekeeping_warp_clock(void)
}
}
-/**
+/*
* __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
- *
*/
static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
{
@@ -1425,7 +1436,7 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0));
}
-/**
+/*
* change_clocksource - Swaps clocksources if a new one is available
*
* Accumulates current time interval and initializes new clocksource
@@ -1548,6 +1559,7 @@ u64 timekeeping_max_deferment(void)
/**
* read_persistent_clock64 - Return time from the persistent clock.
+ * @ts: Pointer to the storage for the readout value
*
* Weak dummy function for arches that do not yet support it.
* Reads the time from the battery backed persistent clock.
@@ -1566,8 +1578,9 @@ void __weak read_persistent_clock64(struct timespec64 *ts)
* from the boot.
*
* Weak dummy function for arches that do not yet support it.
- * wall_time - current time as returned by persistent clock
- * boot_offset - offset that is defined as wall_time - boot_time
+ * @wall_time: - current time as returned by persistent clock
+ * @boot_offset: - offset that is defined as wall_time - boot_time
+ *
* The default function calculates offset based on the current value of
* local_clock(). This way architectures that support sched_clock() but don't
* support dedicated boot time clock will provide the best estimate of the
@@ -1652,7 +1665,8 @@ static struct timespec64 timekeeping_suspend_time;
/**
* __timekeeping_inject_sleeptime - Internal function to add sleep interval
- * @delta: pointer to a timespec delta value
+ * @tk: Pointer to the timekeeper to be updated
+ * @delta: Pointer to the delta value in timespec64 format
*
* Takes a timespec offset measuring a suspend interval and properly
* adds the sleep offset to the timekeeping variables.
@@ -2023,13 +2037,12 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
}
}
-/**
+/*
* accumulate_nsecs_to_secs - Accumulates nsecs into secs
*
* Helper function that accumulates the nsecs greater than a second
* from the xtime_nsec field to the xtime_secs field.
* It also calls into the NTP code to handle leapsecond processing.
- *
*/
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
@@ -2071,7 +2084,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
return clock_set;
}
-/**
+/*
* logarithmic_accumulation - shifted accumulation of cycles
*
* This functions accumulates a shifted interval of cycles into
@@ -2314,7 +2327,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
return base;
}
-/**
+/*
* timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
*/
static int timekeeping_validate_timex(const struct __kernel_timex *txc)
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 099737f6f10c..6c2cbd9ef999 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -26,7 +26,7 @@ extern void do_timer(unsigned long ticks);
extern void update_wall_time(void);
extern raw_spinlock_t jiffies_lock;
-extern seqcount_t jiffies_seq;
+extern seqcount_raw_spinlock_t jiffies_seq;
#define CS_NAME_LEN 32
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index c3ad64fb9d8b..8dbc008f8942 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1283,7 +1283,7 @@ static void del_timer_wait_running(struct timer_list *timer)
u32 tf;
tf = READ_ONCE(timer->flags);
- if (!(tf & TIMER_MIGRATING)) {
+ if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) {
struct timer_base *base = get_timer_base(tf);
/*
@@ -1367,6 +1367,13 @@ int del_timer_sync(struct timer_list *timer)
*/
WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
+ /*
+ * Must be able to sleep on PREEMPT_RT because of the slowpath in
+ * del_timer_wait_running().
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
+ lockdep_assert_preemption_enabled();
+
do {
ret = try_to_del_timer_sync(timer);
@@ -1693,29 +1700,6 @@ void timer_clear_idle(void)
}
#endif
-/*
- * Called from the timer interrupt handler to charge one tick to the current
- * process. user_tick is 1 if the tick is user time, 0 for system.
- */
-void update_process_times(int user_tick)
-{
- struct task_struct *p = current;
-
- PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0);
-
- /* Note: this timer irq context must be accounted for as well. */
- account_process_tick(p, user_tick);
- run_local_timers();
- rcu_sched_clock_irq(user_tick);
-#ifdef CONFIG_IRQ_WORK
- if (in_irq())
- irq_work_tick();
-#endif
- scheduler_tick();
- if (IS_ENABLED(CONFIG_POSIX_TIMERS))
- run_posix_cpu_timers();
-}
-
/**
* __run_timers - run all expired timers (if any) on this CPU.
* @base: the timer vector to be processed.
@@ -1765,7 +1749,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
/*
* Called by the local, per-CPU timer interrupt on SMP.
*/
-void run_local_timers(void)
+static void run_local_timers(void)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
@@ -1783,6 +1767,29 @@ void run_local_timers(void)
}
/*
+ * Called from the timer interrupt handler to charge one tick to the current
+ * process. user_tick is 1 if the tick is user time, 0 for system.
+ */
+void update_process_times(int user_tick)
+{
+ struct task_struct *p = current;
+
+ PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0);
+
+ /* Note: this timer irq context must be accounted for as well. */
+ account_process_tick(p, user_tick);
+ run_local_timers();
+ rcu_sched_clock_irq(user_tick);
+#ifdef CONFIG_IRQ_WORK
+ if (in_irq())
+ irq_work_tick();
+#endif
+ scheduler_tick();
+ if (IS_ENABLED(CONFIG_POSIX_TIMERS))
+ run_posix_cpu_timers();
+}
+
+/*
* Since schedule_timeout()'s timer is defined on the stack, it must store
* the target task on the stack as well.
*/
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index acb326f5f50a..6939140ab7c5 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -42,24 +42,11 @@ static void SEQ_printf(struct seq_file *m, const char *fmt, ...)
va_end(args);
}
-static void print_name_offset(struct seq_file *m, void *sym)
-{
- char symname[KSYM_NAME_LEN];
-
- if (lookup_symbol_name((unsigned long)sym, symname) < 0)
- SEQ_printf(m, "<%pK>", sym);
- else
- SEQ_printf(m, "%s", symname);
-}
-
static void
print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
int idx, u64 now)
{
- SEQ_printf(m, " #%d: ", idx);
- print_name_offset(m, taddr);
- SEQ_printf(m, ", ");
- print_name_offset(m, timer->function);
+ SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function);
SEQ_printf(m, ", S:%02x", timer->state);
SEQ_printf(m, "\n");
SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
@@ -116,9 +103,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
- SEQ_printf(m, " .get_time: ");
- print_name_offset(m, base->get_time);
- SEQ_printf(m, "\n");
+ SEQ_printf(m, " .get_time: %ps\n", base->get_time);
#ifdef CONFIG_HIGH_RES_TIMERS
SEQ_printf(m, " .offset: %Lu nsecs\n",
(unsigned long long) ktime_to_ns(base->offset));
@@ -218,42 +203,29 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
SEQ_printf(m, " next_event: %Ld nsecs\n",
(unsigned long long) ktime_to_ns(dev->next_event));
- SEQ_printf(m, " set_next_event: ");
- print_name_offset(m, dev->set_next_event);
- SEQ_printf(m, "\n");
+ SEQ_printf(m, " set_next_event: %ps\n", dev->set_next_event);
- if (dev->set_state_shutdown) {
- SEQ_printf(m, " shutdown: ");
- print_name_offset(m, dev->set_state_shutdown);
- SEQ_printf(m, "\n");
- }
+ if (dev->set_state_shutdown)
+ SEQ_printf(m, " shutdown: %ps\n",
+ dev->set_state_shutdown);
- if (dev->set_state_periodic) {
- SEQ_printf(m, " periodic: ");
- print_name_offset(m, dev->set_state_periodic);
- SEQ_printf(m, "\n");
- }
+ if (dev->set_state_periodic)
+ SEQ_printf(m, " periodic: %ps\n",
+ dev->set_state_periodic);
- if (dev->set_state_oneshot) {
- SEQ_printf(m, " oneshot: ");
- print_name_offset(m, dev->set_state_oneshot);
- SEQ_printf(m, "\n");
- }
+ if (dev->set_state_oneshot)
+ SEQ_printf(m, " oneshot: %ps\n",
+ dev->set_state_oneshot);
- if (dev->set_state_oneshot_stopped) {
- SEQ_printf(m, " oneshot stopped: ");
- print_name_offset(m, dev->set_state_oneshot_stopped);
- SEQ_printf(m, "\n");
- }
+ if (dev->set_state_oneshot_stopped)
+ SEQ_printf(m, " oneshot stopped: %ps\n",
+ dev->set_state_oneshot_stopped);
- if (dev->tick_resume) {
- SEQ_printf(m, " resume: ");
- print_name_offset(m, dev->tick_resume);
- SEQ_printf(m, "\n");
- }
+ if (dev->tick_resume)
+ SEQ_printf(m, " resume: %ps\n",
+ dev->tick_resume);
- SEQ_printf(m, " event_handler: ");
- print_name_offset(m, dev->event_handler);
+ SEQ_printf(m, " event_handler: %ps\n", dev->event_handler);
SEQ_printf(m, "\n");
SEQ_printf(m, " retries: %lu\n", dev->retries);
SEQ_printf(m, "\n");
diff --git a/kernel/torture.c b/kernel/torture.c
index 1061492f14bd..8562ac18d2eb 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -602,18 +602,29 @@ static int stutter_gap;
*/
bool stutter_wait(const char *title)
{
- int spt;
+ ktime_t delay;
+ unsigned int i = 0;
bool ret = false;
+ int spt;
cond_resched_tasks_rcu_qs();
spt = READ_ONCE(stutter_pause_test);
for (; spt; spt = READ_ONCE(stutter_pause_test)) {
- ret = true;
+ if (!ret) {
+ sched_set_normal(current, MAX_NICE);
+ ret = true;
+ }
if (spt == 1) {
schedule_timeout_interruptible(1);
} else if (spt == 2) {
- while (READ_ONCE(stutter_pause_test))
+ while (READ_ONCE(stutter_pause_test)) {
+ if (!(i++ & 0xffff)) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ delay = 10 * NSEC_PER_USEC;
+ schedule_hrtimeout(&delay, HRTIMER_MODE_REL);
+ }
cond_resched();
+ }
} else {
schedule_timeout_interruptible(round_jiffies_relative(HZ));
}
@@ -629,20 +640,27 @@ EXPORT_SYMBOL_GPL(stutter_wait);
*/
static int torture_stutter(void *arg)
{
+ ktime_t delay;
+ DEFINE_TORTURE_RANDOM(rand);
int wtime;
VERBOSE_TOROUT_STRING("torture_stutter task started");
do {
if (!torture_must_stop() && stutter > 1) {
wtime = stutter;
- if (stutter > HZ + 1) {
+ if (stutter > 2) {
WRITE_ONCE(stutter_pause_test, 1);
- wtime = stutter - HZ - 1;
- schedule_timeout_interruptible(wtime);
- wtime = HZ + 1;
+ wtime = stutter - 3;
+ delay = ktime_divns(NSEC_PER_SEC * wtime, HZ);
+ delay += (torture_random(&rand) >> 3) % NSEC_PER_MSEC;
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_hrtimeout(&delay, HRTIMER_MODE_REL);
+ wtime = 2;
}
WRITE_ONCE(stutter_pause_test, 2);
- schedule_timeout_interruptible(wtime);
+ delay = ktime_divns(NSEC_PER_SEC * wtime, HZ);
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_hrtimeout(&delay, HRTIMER_MODE_REL);
}
WRITE_ONCE(stutter_pause_test, 0);
if (!torture_must_stop())
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a4020c0b4508..e1bf5228fb69 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -202,7 +202,7 @@ config DYNAMIC_FTRACE_WITH_REGS
config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
def_bool y
- depends on DYNAMIC_FTRACE
+ depends on DYNAMIC_FTRACE_WITH_REGS
depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
config FUNCTION_PROFILER
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 818cf8ad6f76..852226dd4b11 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1337,9 +1337,9 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
- case BPF_FUNC_bpf_per_cpu_ptr:
+ case BPF_FUNC_per_cpu_ptr:
return &bpf_per_cpu_ptr_proto;
- case BPF_FUNC_bpf_this_cpu_ptr:
+ case BPF_FUNC_this_cpu_ptr:
return &bpf_this_cpu_ptr_proto;
default:
return NULL;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8185f7240095..9c1bba8cc51b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1629,6 +1629,8 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
static struct ftrace_ops *
ftrace_find_tramp_ops_any(struct dyn_ftrace *rec);
static struct ftrace_ops *
+ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude);
+static struct ftrace_ops *
ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops);
static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
@@ -1778,7 +1780,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
* to it.
*/
if (ftrace_rec_count(rec) == 1 &&
- ftrace_find_tramp_ops_any(rec))
+ ftrace_find_tramp_ops_any_other(rec, ops))
rec->flags |= FTRACE_FL_TRAMP;
else
rec->flags &= ~FTRACE_FL_TRAMP;
@@ -2245,6 +2247,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec)
}
static struct ftrace_ops *
+ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude)
+{
+ struct ftrace_ops *op;
+ unsigned long ip = rec->ip;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+
+ if (op == op_exclude || !op->trampoline)
+ continue;
+
+ if (hash_contains_ip(ip, op->func_hash))
+ return op;
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
+static struct ftrace_ops *
ftrace_find_tramp_ops_next(struct dyn_ftrace *rec,
struct ftrace_ops *op)
{
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index dc83b3fa9fe7..a6268e09160a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3234,14 +3234,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
/* See if we shot pass the end of this buffer page */
if (unlikely(write > BUF_PAGE_SIZE)) {
- if (tail != w) {
- /* before and after may now different, fix it up*/
- b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
- if (a_ok && b_ok && info->before != info->after)
- (void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
- info->before, info->after);
- }
+ /* before and after may now different, fix it up*/
+ b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ if (a_ok && b_ok && info->before != info->after)
+ (void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
+ info->before, info->after);
return rb_move_tail(cpu_buffer, tail, info);
}
@@ -3287,11 +3285,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
ts = rb_time_stamp(cpu_buffer->buffer);
barrier();
/*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
- info->after < ts) {
+ info->after < ts &&
+ rb_time_cmpxchg(&cpu_buffer->write_stamp,
+ info->after, ts)) {
/* Nothing came after this event between C and E */
info->delta = ts - info->after;
- (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
- info->after, info->ts);
info->ts = ts;
} else {
/*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 410cfeb16db5..06134189e9a7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -163,7 +163,8 @@ static union trace_eval_map_item *trace_eval_maps;
#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
int tracing_set_tracer(struct trace_array *tr, const char *buf);
-static void ftrace_trace_userstack(struct trace_buffer *buffer,
+static void ftrace_trace_userstack(struct trace_array *tr,
+ struct trace_buffer *buffer,
unsigned long flags, int pc);
#define MAX_TRACER_SIZE 100
@@ -2870,7 +2871,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
* two. They are not that meaningful.
*/
ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs);
- ftrace_trace_userstack(buffer, flags, pc);
+ ftrace_trace_userstack(tr, buffer, flags, pc);
}
/*
@@ -3056,13 +3057,14 @@ EXPORT_SYMBOL_GPL(trace_dump_stack);
static DEFINE_PER_CPU(int, user_stack_count);
static void
-ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc)
+ftrace_trace_userstack(struct trace_array *tr,
+ struct trace_buffer *buffer, unsigned long flags, int pc)
{
struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
- if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE))
+ if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE))
return;
/*
@@ -3101,7 +3103,8 @@ ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc)
preempt_enable();
}
#else /* CONFIG_USER_STACKTRACE_SUPPORT */
-static void ftrace_trace_userstack(struct trace_buffer *buffer,
+static void ftrace_trace_userstack(struct trace_array *tr,
+ struct trace_buffer *buffer,
unsigned long flags, int pc)
{
}
@@ -3534,7 +3537,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
}
#define STATIC_TEMP_BUF_SIZE 128
-static char static_temp_buf[STATIC_TEMP_BUF_SIZE];
+static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4);
/* Find the next real entry, without updating the iterator itself */
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 47a71f96e5bc..adf65b502453 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3428,10 +3428,10 @@ static __init int event_trace_enable(void)
* initialize events and perhaps start any events that are on the
* command line. Unfortunately, there are some events that will not
* start this early, like the system call tracepoints that need
- * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable()
- * is called before pid 1 starts, and this flag is never set, making
- * the syscall tracepoint never get reached, but the event is enabled
- * regardless (and not doing anything).
+ * to set the %SYSCALL_WORK_SYSCALL_TRACEPOINT flag of pid 1. But
+ * event_trace_enable() is called before pid 1 starts, and this flag
+ * is never set, making the syscall tracepoint never get reached, but
+ * the event is enabled regardless (and not doing anything).
*/
static __init int event_trace_enable_again(void)
{
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index c9ad5c6fbaad..d071fc271eef 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -368,7 +368,7 @@ static int start_kthread(struct trace_array *tr)
struct task_struct *kthread;
int next_cpu;
- if (WARN_ON(hwlat_kthread))
+ if (hwlat_kthread)
return 0;
/* Just pick the first CPU on first iteration */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b911e9f6d9f5..97c7a7782db7 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1731,7 +1731,8 @@ NOKPROBE_SYMBOL(kprobe_dispatcher);
static int
kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
{
- struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
+ struct kretprobe *rp = get_kretprobe(ri);
+ struct trace_kprobe *tk = container_of(rp, struct trace_kprobe, rp);
raw_cpu_inc(*tk->nhit);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 3f659f855074..7261fa0f5e3c 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -594,7 +594,7 @@ int syscall_regfunc(void)
if (!sys_tracepoint_refcount) {
read_lock(&tasklist_lock);
for_each_process_thread(p, t) {
- set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+ set_task_syscall_work(t, SYSCALL_TRACEPOINT);
}
read_unlock(&tasklist_lock);
}
@@ -611,7 +611,7 @@ void syscall_unregfunc(void)
if (!sys_tracepoint_refcount) {
read_lock(&tasklist_lock);
for_each_process_thread(p, t) {
- clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+ clear_task_syscall_work(t, SYSCALL_TRACEPOINT);
}
read_unlock(&tasklist_lock);
}
diff --git a/kernel/user.c b/kernel/user.c
index b1635d94a1f2..a2478cddf536 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -55,7 +55,7 @@ struct user_namespace init_user_ns = {
},
},
},
- .count = ATOMIC_INIT(3),
+ .ns.count = REFCOUNT_INIT(3),
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
.ns.inum = PROC_USER_INIT_INO,
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index e703d5d9cbe8..af612945a4d0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -111,7 +111,7 @@ int create_user_ns(struct cred *new)
goto fail_free;
ns->ns.ops = &userns_operations;
- atomic_set(&ns->count, 1);
+ refcount_set(&ns->ns.count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
ns->parent = parent_ns;
ns->level = parent_ns->level + 1;
@@ -197,7 +197,7 @@ static void free_user_ns(struct work_struct *work)
kmem_cache_free(user_ns_cachep, ns);
dec_user_namespaces(ucounts);
ns = parent;
- } while (atomic_dec_and_test(&parent->count));
+ } while (refcount_dec_and_test(&parent->ns.count));
}
void __put_user_ns(struct user_namespace *ns)
diff --git a/kernel/utsname.c b/kernel/utsname.c
index e488d0e2ab45..b1ac3ca870f2 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -33,7 +33,7 @@ static struct uts_namespace *create_uts_ns(void)
uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL);
if (uts_ns)
- kref_init(&uts_ns->kref);
+ refcount_set(&uts_ns->ns.count, 1);
return uts_ns;
}
@@ -103,11 +103,8 @@ struct uts_namespace *copy_utsname(unsigned long flags,
return new_ns;
}
-void free_uts_ns(struct kref *kref)
+void free_uts_ns(struct uts_namespace *ns)
{
- struct uts_namespace *ns;
-
- ns = container_of(kref, struct uts_namespace, kref);
dec_uts_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);