summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.freezer1
-rw-r--r--kernel/Kconfig.hz1
-rw-r--r--kernel/Kconfig.locks1
-rw-r--r--kernel/Kconfig.preempt1
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/async.c6
-rw-r--r--kernel/audit.c42
-rw-r--r--kernel/audit.h23
-rw-r--r--kernel/audit_fsnotify.c11
-rw-r--r--kernel/audit_watch.c15
-rw-r--r--kernel/auditfilter.c77
-rw-r--r--kernel/auditsc.c42
-rw-r--r--kernel/backtracetest.c6
-rw-r--r--kernel/bpf/arraymap.c10
-rw-r--r--kernel/bpf/bpf_lru_list.c5
-rw-r--r--kernel/bpf/bpf_lru_list.h5
-rw-r--r--kernel/bpf/cgroup.c5
-rw-r--r--kernel/bpf/core.c7
-rw-r--r--kernel/bpf/cpumap.c2
-rw-r--r--kernel/bpf/devmap.c22
-rw-r--r--kernel/bpf/disasm.c10
-rw-r--r--kernel/bpf/disasm.h10
-rw-r--r--kernel/bpf/hashtab.c33
-rw-r--r--kernel/bpf/helpers.c10
-rw-r--r--kernel/bpf/inode.c7
-rw-r--r--kernel/bpf/lpm_trie.c14
-rw-r--r--kernel/bpf/map_in_map.c5
-rw-r--r--kernel/bpf/map_in_map.h5
-rw-r--r--kernel/bpf/percpu_freelist.c5
-rw-r--r--kernel/bpf/percpu_freelist.h5
-rw-r--r--kernel/bpf/stackmap.c5
-rw-r--r--kernel/bpf/syscall.c25
-rw-r--r--kernel/bpf/tnum.c1
-rw-r--r--kernel/bpf/verifier.c22
-rw-r--r--kernel/cgroup/cgroup-v1.c1
-rw-r--r--kernel/cgroup/cgroup.c125
-rw-r--r--kernel/cgroup/cpuset.c17
-rw-r--r--kernel/cgroup/pids.c5
-rw-r--r--kernel/cgroup/rdma.c5
-rw-r--r--kernel/cgroup/rstat.c1
-rw-r--r--kernel/compat.c5
-rw-r--r--kernel/context_tracking.c1
-rw-r--r--kernel/cpu.c19
-rw-r--r--kernel/cpu_pm.c11
-rw-r--r--kernel/crash_core.c4
-rw-r--r--kernel/crash_dump.c1
-rw-r--r--kernel/cred.c15
-rw-r--r--kernel/debug/Makefile1
-rw-r--r--kernel/delayacct.c11
-rw-r--r--kernel/dma/Kconfig1
-rw-r--r--kernel/dma/debug.c14
-rw-r--r--kernel/dma/swiotlb.c1
-rw-r--r--kernel/events/core.c27
-rw-r--r--kernel/events/internal.h4
-rw-r--r--kernel/events/ring_buffer.c64
-rw-r--r--kernel/events/uprobes.c4
-rw-r--r--kernel/exit.c3
-rw-r--r--kernel/extable.c14
-rw-r--r--kernel/fork.c75
-rw-r--r--kernel/freezer.c1
-rw-r--r--kernel/futex.c84
-rw-r--r--kernel/gcov/Kconfig1
-rwxr-xr-xkernel/gen_kheaders.sh (renamed from kernel/gen_ikh_data.sh)17
-rw-r--r--kernel/hung_task.c1
-rw-r--r--kernel/irq/Kconfig1
-rw-r--r--kernel/irq/Makefile3
-rw-r--r--kernel/irq/affinity.c12
-rw-r--r--kernel/irq/autoprobe.c6
-rw-r--r--kernel/irq/chip.c10
-rw-r--r--kernel/irq/cpuhotplug.c2
-rw-r--r--kernel/irq/internals.h26
-rw-r--r--kernel/irq/irqdesc.c16
-rw-r--r--kernel/irq/irqdomain.c4
-rw-r--r--kernel/irq/manage.c90
-rw-r--r--kernel/irq/timings.c453
-rw-r--r--kernel/irq_work.c1
-rw-r--r--kernel/jump_label.c65
-rw-r--r--kernel/kallsyms.c1
-rw-r--r--kernel/kexec.c4
-rw-r--r--kernel/kexec_core.c4
-rw-r--r--kernel/kexec_file.c4
-rw-r--r--kernel/kheaders.c40
-rw-r--r--kernel/kprobes.c15
-rw-r--r--kernel/ksysfs.c4
-rw-r--r--kernel/kthread.c1
-rw-r--r--kernel/latencytop.c6
-rw-r--r--kernel/livepatch/Kconfig1
-rw-r--r--kernel/livepatch/Makefile1
-rw-r--r--kernel/livepatch/core.c20
-rw-r--r--kernel/livepatch/patch.c14
-rw-r--r--kernel/livepatch/shadow.c14
-rw-r--r--kernel/livepatch/transition.c14
-rw-r--r--kernel/locking/Makefile2
-rw-r--r--kernel/locking/lock_events.h7
-rw-r--r--kernel/locking/lock_events_list.h12
-rw-r--r--kernel/locking/lockdep.c743
-rw-r--r--kernel/locking/lockdep_internals.h36
-rw-r--r--kernel/locking/locktorture.c2
-rw-r--r--kernel/locking/mutex.c1
-rw-r--r--kernel/locking/percpu-rwsem.c3
-rw-r--r--kernel/locking/qrwlock.c11
-rw-r--r--kernel/locking/qspinlock.c11
-rw-r--r--kernel/locking/qspinlock_stat.h10
-rw-r--r--kernel/locking/rtmutex.c1
-rw-r--r--kernel/locking/rwsem-xadd.c745
-rw-r--r--kernel/locking/rwsem.c1453
-rw-r--r--kernel/locking/rwsem.h306
-rw-r--r--kernel/locking/semaphore.c3
-rw-r--r--kernel/locking/test-ww_mutex.c15
-rw-r--r--kernel/memremap.c23
-rw-r--r--kernel/module-internal.h6
-rw-r--r--kernel/module.c19
-rw-r--r--kernel/module_signing.c6
-rw-r--r--kernel/notifier.c1
-rw-r--r--kernel/nsproxy.c6
-rw-r--r--kernel/panic.c1
-rw-r--r--kernel/params.c14
-rw-r--r--kernel/pid.c1
-rw-r--r--kernel/pid_namespace.c1
-rw-r--r--kernel/power/Kconfig1
-rw-r--r--kernel/power/energy_model.c2
-rw-r--r--kernel/power/hibernate.c16
-rw-r--r--kernel/power/main.c4
-rw-r--r--kernel/power/poweroff.c3
-rw-r--r--kernel/power/qos.c1
-rw-r--r--kernel/power/snapshot.c4
-rw-r--r--kernel/power/suspend.c12
-rw-r--r--kernel/power/suspend_test.c3
-rw-r--r--kernel/power/swap.c4
-rw-r--r--kernel/power/user.c4
-rw-r--r--kernel/printk/Makefile1
-rw-r--r--kernel/printk/internal.h14
-rw-r--r--kernel/printk/printk.c1
-rw-r--r--kernel/printk/printk_safe.c14
-rw-r--r--kernel/profile.c1
-rw-r--r--kernel/ptrace.c28
-rw-r--r--kernel/rcu/Kconfig1
-rw-r--r--kernel/rcu/Kconfig.debug1
-rw-r--r--kernel/rcu/rcu.h5
-rw-r--r--kernel/rcu/rcutorture.c96
-rw-r--r--kernel/rcu/srcutree.c69
-rw-r--r--kernel/rcu/sync.c214
-rw-r--r--kernel/rcu/tree.c164
-rw-r--r--kernel/rcu/tree.h6
-rw-r--r--kernel/rcu/tree_exp.h53
-rw-r--r--kernel/rcu/tree_plugin.h195
-rw-r--r--kernel/rcu/tree_stall.h4
-rw-r--r--kernel/rcu/update.c13
-rw-r--r--kernel/reboot.c1
-rw-r--r--kernel/resource.c1
-rw-r--r--kernel/sched/autogroup.c2
-rw-r--r--kernel/sched/clock.c1
-rw-r--r--kernel/sched/core.c534
-rw-r--r--kernel/sched/cpudeadline.c10
-rw-r--r--kernel/sched/cpufreq_schedutil.c24
-rw-r--r--kernel/sched/cpupri.c10
-rw-r--r--kernel/sched/cputime.c1
-rw-r--r--kernel/sched/deadline.c8
-rw-r--r--kernel/sched/debug.c48
-rw-r--r--kernel/sched/fair.c628
-rw-r--r--kernel/sched/features.h1
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/isolation.c1
-rw-r--r--kernel/sched/membarrier.c11
-rw-r--r--kernel/sched/pelt.c13
-rw-r--r--kernel/sched/pelt.h2
-rw-r--r--kernel/sched/rt.c8
-rw-r--r--kernel/sched/sched-pelt.h2
-rw-r--r--kernel/sched/sched.h134
-rw-r--r--kernel/sched/topology.c18
-rw-r--r--kernel/sched/wait.c9
-rw-r--r--kernel/sched/wait_bit.c1
-rw-r--r--kernel/signal.c21
-rw-r--r--kernel/smp.c13
-rw-r--r--kernel/smpboot.c1
-rw-r--r--kernel/softirq.c5
-rw-r--r--kernel/stacktrace.c3
-rw-r--r--kernel/stop_machine.c22
-rw-r--r--kernel/sys.c62
-rw-r--r--kernel/sysctl.c61
-rw-r--r--kernel/taskstats.c12
-rw-r--r--kernel/test_kprobes.c11
-rw-r--r--kernel/time/Kconfig1
-rw-r--r--kernel/time/Makefile1
-rw-r--r--kernel/time/alarmtimer.c1
-rw-r--r--kernel/time/clocksource.c4
-rw-r--r--kernel/time/hrtimer.c8
-rw-r--r--kernel/time/ntp.c4
-rw-r--r--kernel/time/posix-timers.c13
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/time.c4
-rw-r--r--kernel/time/timekeeping.c5
-rw-r--r--kernel/time/timer_list.c36
-rw-r--r--kernel/time/vsyscall.c133
-rw-r--r--kernel/torture.c23
-rw-r--r--kernel/trace/Kconfig1
-rw-r--r--kernel/trace/bpf_trace.c105
-rw-r--r--kernel/trace/ftrace.c12
-rw-r--r--kernel/trace/trace.c34
-rw-r--r--kernel/trace/trace.h19
-rw-r--r--kernel/trace/trace_events_filter.c8
-rw-r--r--kernel/trace/trace_events_hist.c13
-rw-r--r--kernel/trace/trace_hwlat.c2
-rw-r--r--kernel/trace/trace_kdb.c6
-rw-r--r--kernel/trace/trace_output.c2
-rw-r--r--kernel/trace/trace_uprobe.c15
-rw-r--r--kernel/tracepoint.c15
-rw-r--r--kernel/tsacct.c13
-rw-r--r--kernel/ucount.c7
-rw-r--r--kernel/umh.c1
-rw-r--r--kernel/up.c4
-rw-r--r--kernel/user-return-notifier.c1
-rw-r--r--kernel/user.c1
-rw-r--r--kernel/user_namespace.c7
-rw-r--r--kernel/utsname.c6
-rw-r--r--kernel/utsname_sysctl.c6
-rw-r--r--kernel/workqueue.c1
217 files changed, 5013 insertions, 3362 deletions
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
index a3bb4cb52539..68646feefb3d 100644
--- a/kernel/Kconfig.freezer
+++ b/kernel/Kconfig.freezer
@@ -1,2 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
config FREEZER
def_bool PM_SLEEP || CGROUP_FREEZER
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 2a202a846757..38ef6d06888e 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Timer Interrupt Frequency Configuration
#
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index bf770d7556f7..e0852dc333ac 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# The ARCH_INLINE foo is necessary because select ignores "depends on"
#
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0fee5fe6c899..dc0b682ec2d9 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
choice
prompt "Preemption Model"
diff --git a/kernel/Makefile b/kernel/Makefile
index 33824f0385b3..a8d923b5481b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -71,7 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
-obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o
+obj-$(CONFIG_IKHEADERS) += kheaders.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -127,7 +127,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
-cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_ikh_data.sh $@
+cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@
$(obj)/kheaders_data.tar.xz: FORCE
$(call cmd,genikh)
diff --git a/kernel/async.c b/kernel/async.c
index 12c332e4e13e..4f9c1d614016 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* async.c: Asynchronous function calls for boot performance
*
* (C) Copyright 2009 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
diff --git a/kernel/audit.c b/kernel/audit.c
index b96bf69183f4..da8dc0db5bd3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* audit.c -- Auditing support
* Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
* System-call specific features have moved to auditsc.c
@@ -5,20 +6,6 @@
* Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
* Written by Rickard E. (Rik) Faith <faith@redhat.com>
*
* Goals: 1) Integrate fully with Security Modules.
@@ -2274,6 +2261,33 @@ out:
}
/**
+ * audit_signal_info - record signal info for shutting down audit subsystem
+ * @sig: signal value
+ * @t: task being signaled
+ *
+ * If the audit subsystem is being terminated, record the task (pid)
+ * and uid that is doing that.
+ */
+int audit_signal_info(int sig, struct task_struct *t)
+{
+ kuid_t uid = current_uid(), auid;
+
+ if (auditd_test_task(t) &&
+ (sig == SIGTERM || sig == SIGHUP ||
+ sig == SIGUSR1 || sig == SIGUSR2)) {
+ audit_sig_pid = task_tgid_nr(current);
+ auid = audit_get_loginuid(current);
+ if (uid_valid(auid))
+ audit_sig_uid = auid;
+ else
+ audit_sig_uid = uid;
+ security_task_getsecid(current, &audit_sig_sid);
+ }
+
+ return audit_signal_info_syscall(t);
+}
+
+/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
diff --git a/kernel/audit.h b/kernel/audit.h
index 2071725a999f..6fb7160412d4 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -1,22 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/* audit -- definition of audit_context structure and supporting types
*
* Copyright 2003-2004 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
@@ -299,7 +286,7 @@ extern const char *audit_tree_path(struct audit_tree *tree);
extern void audit_put_tree(struct audit_tree *tree);
extern void audit_kill_trees(struct audit_context *context);
-extern int audit_signal_info(int sig, struct task_struct *t);
+extern int audit_signal_info_syscall(struct task_struct *t);
extern void audit_filter_inodes(struct task_struct *tsk,
struct audit_context *ctx);
extern struct list_head *audit_killed_trees(void);
@@ -330,7 +317,11 @@ extern struct list_head *audit_killed_trees(void);
#define audit_tree_path(rule) "" /* never called */
#define audit_kill_trees(context) BUG()
-#define audit_signal_info(s, t) AUDIT_DISABLED
+static inline int audit_signal_info_syscall(struct task_struct *t)
+{
+ return 0;
+}
+
#define audit_filter_inodes(t, c) AUDIT_DISABLED
#endif /* CONFIG_AUDITSYSCALL */
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index b5737b826951..f0d243318452 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -1,18 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* audit_fsnotify.c -- tracking inodes
*
* Copyright 2003-2009,2014-2015 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/kernel.h>
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index b50c574223fa..1f31c2f1e6fc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -1,22 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* audit_watch.c -- watching inodes
*
* Copyright 2003-2009 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/file.h>
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 303fb04770ce..b0126e9c0743 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1,22 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* auditfilter.c -- filtering of audit events
*
* Copyright 2003-2004 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -335,7 +322,7 @@ static u32 audit_to_op(u32 op)
/* check if an audit field is valid */
static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
{
- switch(f->type) {
+ switch (f->type) {
case AUDIT_MSGTYPE:
if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE &&
entry->rule.listnr != AUDIT_FILTER_USER)
@@ -347,7 +334,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
break;
}
- switch(entry->rule.listnr) {
+ switch (entry->rule.listnr) {
case AUDIT_FILTER_FS:
switch(f->type) {
case AUDIT_FSTYPE:
@@ -358,9 +345,16 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
}
}
- switch(f->type) {
- default:
- return -EINVAL;
+ /* Check for valid field type and op */
+ switch (f->type) {
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ case AUDIT_PERS: /* <uapi/linux/personality.h> */
+ case AUDIT_DEVMINOR:
+ /* all ops are valid */
+ break;
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
@@ -373,46 +367,53 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
case AUDIT_FSGID:
case AUDIT_OBJ_GID:
case AUDIT_PID:
- case AUDIT_PERS:
case AUDIT_MSGTYPE:
case AUDIT_PPID:
case AUDIT_DEVMAJOR:
- case AUDIT_DEVMINOR:
case AUDIT_EXIT:
case AUDIT_SUCCESS:
case AUDIT_INODE:
case AUDIT_SESSIONID:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ case AUDIT_SADDR_FAM:
/* bit ops are only useful on syscall args */
if (f->op == Audit_bitmask || f->op == Audit_bittest)
return -EINVAL;
break;
- case AUDIT_ARG0:
- case AUDIT_ARG1:
- case AUDIT_ARG2:
- case AUDIT_ARG3:
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
- case AUDIT_SUBJ_SEN:
- case AUDIT_SUBJ_CLR:
case AUDIT_OBJ_USER:
case AUDIT_OBJ_ROLE:
case AUDIT_OBJ_TYPE:
- case AUDIT_OBJ_LEV_LOW:
- case AUDIT_OBJ_LEV_HIGH:
case AUDIT_WATCH:
case AUDIT_DIR:
case AUDIT_FILTERKEY:
- break;
case AUDIT_LOGINUID_SET:
- if ((f->val != 0) && (f->val != 1))
- return -EINVAL;
- /* FALL THROUGH */
case AUDIT_ARCH:
case AUDIT_FSTYPE:
+ case AUDIT_PERM:
+ case AUDIT_FILETYPE:
+ case AUDIT_FIELD_COMPARE:
+ case AUDIT_EXE:
+ /* only equal and not equal valid ops */
if (f->op != Audit_not_equal && f->op != Audit_equal)
return -EINVAL;
break;
+ default:
+ /* field not recognized */
+ return -EINVAL;
+ }
+
+ /* Check for select valid field values */
+ switch (f->type) {
+ case AUDIT_LOGINUID_SET:
+ if ((f->val != 0) && (f->val != 1))
+ return -EINVAL;
+ break;
case AUDIT_PERM:
if (f->val & ~15)
return -EINVAL;
@@ -425,11 +426,14 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
if (f->val > AUDIT_MAX_FIELD_COMPARE)
return -EINVAL;
break;
- case AUDIT_EXE:
- if (f->op != Audit_not_equal && f->op != Audit_equal)
+ case AUDIT_SADDR_FAM:
+ if (f->val >= AF_MAX)
return -EINVAL;
break;
+ default:
+ break;
}
+
return 0;
}
@@ -1203,7 +1207,6 @@ int audit_comparator(u32 left, u32 op, u32 right)
case Audit_bittest:
return ((left & right) == right);
default:
- BUG();
return 0;
}
}
@@ -1226,7 +1229,6 @@ int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
case Audit_bitmask:
case Audit_bittest:
default:
- BUG();
return 0;
}
}
@@ -1249,7 +1251,6 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
case Audit_bitmask:
case Audit_bittest:
default:
- BUG();
return 0;
}
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 95ae27edd417..4effe01ebbe2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -601,12 +601,20 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_WATCH:
- if (name)
- result = audit_watch_compare(rule->watch, name->ino, name->dev);
+ if (name) {
+ result = audit_watch_compare(rule->watch,
+ name->ino,
+ name->dev);
+ if (f->op == Audit_not_equal)
+ result = !result;
+ }
break;
case AUDIT_DIR:
- if (ctx)
+ if (ctx) {
result = match_tree_refs(ctx, rule->tree);
+ if (f->op == Audit_not_equal)
+ result = !result;
+ }
break;
case AUDIT_LOGINUID:
result = audit_uid_comparator(audit_get_loginuid(tsk),
@@ -615,6 +623,11 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_LOGINUID_SET:
result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
+ case AUDIT_SADDR_FAM:
+ if (ctx->sockaddr)
+ result = audit_comparator(ctx->sockaddr->ss_family,
+ f->op, f->val);
+ break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -684,9 +697,13 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_PERM:
result = audit_match_perm(ctx, f->val);
+ if (f->op == Audit_not_equal)
+ result = !result;
break;
case AUDIT_FILETYPE:
result = audit_match_filetype(ctx, f->val);
+ if (f->op == Audit_not_equal)
+ result = !result;
break;
case AUDIT_FIELD_COMPARE:
result = audit_field_compare(tsk, cred, f, ctx, name);
@@ -2360,30 +2377,17 @@ void __audit_ptrace(struct task_struct *t)
}
/**
- * audit_signal_info - record signal info for shutting down audit subsystem
- * @sig: signal value
+ * audit_signal_info_syscall - record signal info for syscalls
* @t: task being signaled
*
* If the audit subsystem is being terminated, record the task (pid)
* and uid that is doing that.
*/
-int audit_signal_info(int sig, struct task_struct *t)
+int audit_signal_info_syscall(struct task_struct *t)
{
struct audit_aux_data_pids *axp;
struct audit_context *ctx = audit_context();
- kuid_t uid = current_uid(), auid, t_uid = task_uid(t);
-
- if (auditd_test_task(t) &&
- (sig == SIGTERM || sig == SIGHUP ||
- sig == SIGUSR1 || sig == SIGUSR2)) {
- audit_sig_pid = task_tgid_nr(current);
- auid = audit_get_loginuid(current);
- if (uid_valid(auid))
- audit_sig_uid = auid;
- else
- audit_sig_uid = uid;
- security_task_getsecid(current, &audit_sig_sid);
- }
+ kuid_t t_uid = task_uid(t);
if (!audit_signals || audit_dummy_context())
return 0;
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a563c8fdad0d..a2a97fa3071b 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Simple stack backtrace regression test module
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
#include <linux/completion.h>
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 584636c9e2eb..262a321f58a6 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016,2017 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
#include <linux/btf.h>
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e6ef4401a138..1b6b9349cb85 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#include <linux/cpumask.h>
#include <linux/spinlock.h>
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 7d4f89b7cb84..f02504640e18 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -1,8 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#ifndef __BPF_LRU_LIST_H_
#define __BPF_LRU_LIST_H_
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index fcde0f7b2585..92a7d0cf8d13 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Functions to manage eBPF programs attached to cgroups
*
* Copyright (c) 2016 Daniel Mack
- *
- * This file is subject to the terms and conditions of version 2 of the GNU
- * General Public License. See the file COPYING in the main directory of the
- * Linux distribution for more details.
*/
#include <linux/kernel.h>
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 242a643af82f..080e2bb644cc 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux Socket Filter - Kernel level socket filtering
*
@@ -12,11 +13,6 @@
* Alexei Starovoitov <ast@plumgrid.com>
* Daniel Borkmann <dborkman@redhat.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Andi Kleen - Fix a few bad bugs and races.
* Kris Katterjohn - Added many additional checks in bpf_check_classic()
*/
@@ -2101,7 +2097,6 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
-int sysctl_bpf_stats_enabled __read_mostly;
/* All definitions of tracepoints related to BPF. */
#define CREATE_TRACE_POINTS
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index cf727d77c6c6..8ebd0fa826f8 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* bpf/cpumap.c
*
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
- * Released under terms in GPL version 2. See COPYING.
*/
/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 191b79948424..cd8297b3bdb9 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
/* Devmaps primary use is as a backend map for XDP BPF helper call
@@ -164,6 +156,9 @@ static void dev_map_free(struct bpf_map *map)
bpf_clear_redirect_map(map);
synchronize_rcu();
+ /* Make sure prior __dev_map_entry_free() have completed. */
+ rcu_barrier();
+
/* To ensure all pending flush operations have completed wait for flush
* bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
* Because the above synchronize_rcu() ensures the map is disconnected
@@ -183,6 +178,7 @@ static void dev_map_free(struct bpf_map *map)
if (!dev)
continue;
+ free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
@@ -278,6 +274,7 @@ void __dev_map_flush(struct bpf_map *map)
unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
u32 bit;
+ rcu_read_lock();
for_each_set_bit(bit, bitmap, map->max_entries) {
struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
struct xdp_bulk_queue *bq;
@@ -288,11 +285,12 @@ void __dev_map_flush(struct bpf_map *map)
if (unlikely(!dev))
continue;
- __clear_bit(bit, bitmap);
-
bq = this_cpu_ptr(dev->bulkq);
bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true);
+
+ __clear_bit(bit, bitmap);
}
+ rcu_read_unlock();
}
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -385,6 +383,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
int cpu;
+ rcu_read_lock();
for_each_online_cpu(cpu) {
bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
__clear_bit(dev->bit, bitmap);
@@ -392,6 +391,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
bq = per_cpu_ptr(dev->bulkq, cpu);
bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false);
}
+ rcu_read_unlock();
}
}
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index d9ce383c0f9c..b44d8c447afd 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index e1324a834a24..e546b18d27da 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -1,14 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#ifndef __BPF_DISASM_H__
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 192d32e77db3..583df5cb302d 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
#include <linux/btf.h>
@@ -527,18 +519,30 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
return insn - insn_buf;
}
-static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
+static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
+ void *key, const bool mark)
{
struct htab_elem *l = __htab_map_lookup_elem(map, key);
if (l) {
- bpf_lru_node_set_ref(&l->lru_node);
+ if (mark)
+ bpf_lru_node_set_ref(&l->lru_node);
return l->key + round_up(map->key_size, 8);
}
return NULL;
}
+static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return __htab_lru_map_lookup_elem(map, key, true);
+}
+
+static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
+{
+ return __htab_lru_map_lookup_elem(map, key, false);
+}
+
static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
@@ -1250,6 +1254,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_lru_map_lookup_elem,
+ .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
.map_update_elem = htab_lru_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_gen_lookup = htab_lru_map_gen_lookup,
@@ -1281,7 +1286,6 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l;
void __percpu *pptr;
int ret = -ENOENT;
@@ -1297,8 +1301,9 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
l = __htab_map_lookup_elem(map, key);
if (!l)
goto out;
- if (htab_is_lru(htab))
- bpf_lru_node_set_ref(&l->lru_node);
+ /* We do not mark LRU map element here in order to not mess up
+ * eviction heuristics when user space does a map walk.
+ */
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 4266ffde07ca..5e28718928ca 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
#include <linux/rcupdate.h>
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index bc53e5b20ddc..cc0d0cf114e3 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Minimal file system backend for holding eBPF maps and programs,
* used by bpf(2) object pinning.
@@ -5,10 +6,6 @@
* Authors:
*
* Daniel Borkmann <daniel@iogearbox.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/init.h>
@@ -518,7 +515,7 @@ out:
static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
{
struct bpf_prog *prog;
- int ret = inode_permission(inode, MAY_READ | MAY_WRITE);
+ int ret = inode_permission(inode, MAY_READ);
if (ret)
return ERR_PTR(ret);
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index e61630c2e50b..57b59cca4db7 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Longest prefix match list implementation
*
* Copyright (c) 2016,2017 Daniel Mack
* Copyright (c) 2016 David Herrmann
- *
- * This file is subject to the terms and conditions of version 2 of the GNU
- * General Public License. See the file COPYING in the main directory of the
- * Linux distribution for more details.
*/
#include <linux/bpf.h>
@@ -716,9 +713,14 @@ find_leftmost:
* have exact two children, so this function will never return NULL.
*/
for (node = search_root; node;) {
- if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+ if (node->flags & LPM_TREE_NODE_FLAG_IM) {
+ node = rcu_dereference(node->child[0]);
+ } else {
next_node = node;
- node = rcu_dereference(node->child[0]);
+ node = rcu_dereference(node->child[0]);
+ if (!node)
+ node = rcu_dereference(next_node->child[1]);
+ }
}
do_copy:
next_key->prefixlen = next_node->prefixlen;
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 3dff41403583..fab4fb134547 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2017 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#include <linux/slab.h>
#include <linux/bpf.h>
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index 6183db9ec08c..a507bf6ef8b9 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -1,8 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2017 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#ifndef __MAP_IN_MAP_H__
#define __MAP_IN_MAP_H__
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 0c1b4ba9e90e..6e090140b924 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#include "percpu_freelist.h"
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index c3960118e617..fbf8a8a28979 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -1,8 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#ifndef __PERCPU_FREELIST_H__
#define __PERCPU_FREELIST_H__
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 950ab2f28922..d38e49f943a1 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#include <linux/bpf.h>
#include <linux/jhash.h>
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ad3ccf82f31d..5b30f8baaf02 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
@@ -808,7 +800,10 @@ static int map_lookup_elem(union bpf_attr *attr)
err = map->ops->map_peek_elem(map, value);
} else {
rcu_read_lock();
- ptr = map->ops->map_lookup_elem(map, key);
+ if (map->ops->map_lookup_elem_sys_only)
+ ptr = map->ops->map_lookup_elem_sys_only(map, key);
+ else
+ ptr = map->ops->map_lookup_elem(map, key);
if (IS_ERR(ptr)) {
err = PTR_ERR(ptr);
} else if (!ptr) {
@@ -1578,6 +1573,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
return 0;
default:
return -EINVAL;
@@ -1671,7 +1668,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (err < 0)
goto free_prog;
- prog->aux->load_time = ktime_get_boot_ns();
+ prog->aux->load_time = ktime_get_boottime_ns();
err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
if (err)
goto free_prog;
@@ -1872,6 +1869,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
break;
case BPF_CGROUP_SOCK_OPS:
@@ -1957,6 +1956,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
break;
case BPF_CGROUP_SOCK_OPS:
@@ -2008,6 +2009,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_DEVICE:
case BPF_CGROUP_SYSCTL:
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 938d41211be7..ca52b9642943 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* tnum: tracked (or tristate) numbers
*
* A tnum tracks knowledge about the bits of a value. Each bit can be either
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 95f9354495ad..a5c369e60343 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
* Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <uapi/linux/btf.h>
#include <linux/kernel.h>
@@ -5361,9 +5353,12 @@ static int check_return_code(struct bpf_verifier_env *env)
struct tnum range = tnum_range(0, 1);
switch (env->prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
+ range = tnum_range(1, 1);
case BPF_PROG_TYPE_CGROUP_SKB:
case BPF_PROG_TYPE_CGROUP_SOCK:
- case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
@@ -5380,16 +5375,17 @@ static int check_return_code(struct bpf_verifier_env *env)
}
if (!tnum_in(range, reg->var_off)) {
+ char tn_buf[48];
+
verbose(env, "At program exit the register R0 ");
if (!tnum_is_unknown(reg->var_off)) {
- char tn_buf[48];
-
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env, "has value %s", tn_buf);
} else {
verbose(env, "has unknown scalar value");
}
- verbose(env, " should have been 0 or 1\n");
+ tnum_strn(tn_buf, sizeof(tn_buf), range);
+ verbose(env, " should have been in %s\n", tn_buf);
return -EINVAL;
}
return 0;
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 68ca5de7ec27..88006be40ea3 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"
#include <linux/ctype.h>
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 217cec4e22c6..cdbeff87fa99 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -101,7 +101,7 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
*/
static DEFINE_SPINLOCK(cgroup_file_kn_lock);
-struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
#define cgroup_assert_mutex_or_rcu_locked() \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
@@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[];
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
-static void css_task_iter_advance(struct css_task_iter *it);
+static void css_task_iter_skip(struct css_task_iter *it,
+ struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
struct cgroup_subsys *ss);
@@ -738,6 +739,7 @@ struct css_set init_css_set = {
.dom_cset = &init_css_set,
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
@@ -843,6 +845,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
cgroup_update_populated(link->cgrp, populated);
}
+/*
+ * @task is leaving, advance task iterators which are pointing to it so
+ * that they can resume at the next position. Advancing an iterator might
+ * remove it from the list, use safe walk. See css_task_iter_skip() for
+ * details.
+ */
+static void css_set_skip_task_iters(struct css_set *cset,
+ struct task_struct *task)
+{
+ struct css_task_iter *it, *pos;
+
+ list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
+ css_task_iter_skip(it, task);
+}
+
/**
* css_set_move_task - move a task from one css_set to another
* @task: task being moved
@@ -868,22 +885,9 @@ static void css_set_move_task(struct task_struct *task,
css_set_update_populated(to_cset, true);
if (from_cset) {
- struct css_task_iter *it, *pos;
-
WARN_ON_ONCE(list_empty(&task->cg_list));
- /*
- * @task is leaving, advance task iterators which are
- * pointing to it so that they can resume at the next
- * position. Advancing an iterator might remove it from
- * the list, use safe walk. See css_task_iter_advance*()
- * for details.
- */
- list_for_each_entry_safe(it, pos, &from_cset->task_iters,
- iters_node)
- if (it->task_pos == &task->cg_list)
- css_task_iter_advance(it);
-
+ css_set_skip_task_iters(from_cset, task);
list_del_init(&task->cg_list);
if (!css_set_populated(from_cset))
css_set_update_populated(from_cset, false);
@@ -1210,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
cset->dom_cset = cset;
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
+ INIT_LIST_HEAD(&cset->dying_tasks);
INIT_LIST_HEAD(&cset->task_iters);
INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
@@ -1810,11 +1815,13 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
enum cgroup2_param {
Opt_nsdelegate,
+ Opt_memory_localevents,
nr__cgroup2_params
};
static const struct fs_parameter_spec cgroup2_param_specs[] = {
- fsparam_flag ("nsdelegate", Opt_nsdelegate),
+ fsparam_flag("nsdelegate", Opt_nsdelegate),
+ fsparam_flag("memory_localevents", Opt_memory_localevents),
{}
};
@@ -1837,6 +1844,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_nsdelegate:
ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
+ case Opt_memory_localevents:
+ ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+ return 0;
}
return -EINVAL;
}
@@ -1848,6 +1858,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+
+ if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
}
}
@@ -1855,6 +1870,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
+ seq_puts(seq, ",memory_localevents");
return 0;
}
@@ -4396,15 +4413,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
it->task_pos = NULL;
return;
}
- } while (!css_set_populated(cset));
+ } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
if (!list_empty(&cset->tasks))
it->task_pos = cset->tasks.next;
- else
+ else if (!list_empty(&cset->mg_tasks))
it->task_pos = cset->mg_tasks.next;
+ else
+ it->task_pos = cset->dying_tasks.next;
it->tasks_head = &cset->tasks;
it->mg_tasks_head = &cset->mg_tasks;
+ it->dying_tasks_head = &cset->dying_tasks;
/*
* We don't keep css_sets locked across iteration steps and thus
@@ -4430,9 +4450,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
list_add(&it->iters_node, &cset->task_iters);
}
+static void css_task_iter_skip(struct css_task_iter *it,
+ struct task_struct *task)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ if (it->task_pos == &task->cg_list) {
+ it->task_pos = it->task_pos->next;
+ it->flags |= CSS_TASK_ITER_SKIPPED;
+ }
+}
+
static void css_task_iter_advance(struct css_task_iter *it)
{
- struct list_head *next;
+ struct task_struct *task;
lockdep_assert_held(&css_set_lock);
repeat:
@@ -4442,25 +4473,40 @@ repeat:
* consumed first and then ->mg_tasks. After ->mg_tasks,
* we move onto the next cset.
*/
- next = it->task_pos->next;
-
- if (next == it->tasks_head)
- next = it->mg_tasks_head->next;
+ if (it->flags & CSS_TASK_ITER_SKIPPED)
+ it->flags &= ~CSS_TASK_ITER_SKIPPED;
+ else
+ it->task_pos = it->task_pos->next;
- if (next == it->mg_tasks_head)
+ if (it->task_pos == it->tasks_head)
+ it->task_pos = it->mg_tasks_head->next;
+ if (it->task_pos == it->mg_tasks_head)
+ it->task_pos = it->dying_tasks_head->next;
+ if (it->task_pos == it->dying_tasks_head)
css_task_iter_advance_css_set(it);
- else
- it->task_pos = next;
} else {
/* called from start, proceed to the first cset */
css_task_iter_advance_css_set(it);
}
- /* if PROCS, skip over tasks which aren't group leaders */
- if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
- !thread_group_leader(list_entry(it->task_pos, struct task_struct,
- cg_list)))
- goto repeat;
+ if (!it->task_pos)
+ return;
+
+ task = list_entry(it->task_pos, struct task_struct, cg_list);
+
+ if (it->flags & CSS_TASK_ITER_PROCS) {
+ /* if PROCS, skip over tasks which aren't group leaders */
+ if (!thread_group_leader(task))
+ goto repeat;
+
+ /* and dying leaders w/o live member threads */
+ if (!atomic_read(&task->signal->live))
+ goto repeat;
+ } else {
+ /* skip all dying ones */
+ if (task->flags & PF_EXITING)
+ goto repeat;
+ }
}
/**
@@ -4516,6 +4562,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
spin_lock_irq(&css_set_lock);
+ /* @it may be half-advanced by skips, finish advancing */
+ if (it->flags & CSS_TASK_ITER_SKIPPED)
+ css_task_iter_advance(it);
+
if (it->task_pos) {
it->cur_task = list_entry(it->task_pos, struct task_struct,
cg_list);
@@ -5616,7 +5666,6 @@ int __init cgroup_init(void)
int ssid;
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
- BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
@@ -5997,6 +6046,7 @@ void cgroup_exit(struct task_struct *tsk)
if (!list_empty(&tsk->cg_list)) {
spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
+ list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--;
WARN_ON_ONCE(cgroup_task_frozen(tsk));
@@ -6022,6 +6072,13 @@ void cgroup_release(struct task_struct *task)
do_each_subsys_mask(ss, ssid, have_release_callback) {
ss->release(task);
} while_each_subsys_mask();
+
+ if (use_task_css_set_links) {
+ spin_lock_irq(&css_set_lock);
+ css_set_skip_task_iters(task_css_set(task), task);
+ list_del_init(&task->cg_list);
+ spin_unlock_irq(&css_set_lock);
+ }
}
void cgroup_free(struct task_struct *task)
@@ -6325,7 +6382,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+ return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 6a1942ed781c..a1590e244f5f 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2829,7 +2829,7 @@ static void cpuset_fork(struct task_struct *task)
if (task_css_is_root(task, cpuset_cgrp_id))
return;
- set_cpus_allowed_ptr(task, &current->cpus_allowed);
+ set_cpus_allowed_ptr(task, current->cpus_ptr);
task->mems_allowed = current->mems_allowed;
}
@@ -3254,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
spin_unlock_irqrestore(&callback_lock, flags);
}
+/**
+ * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
+ * @tsk: pointer to task_struct with which the scheduler is struggling
+ *
+ * Description: In the case that the scheduler cannot find an allowed cpu in
+ * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
+ * mode however, this value is the same as task_cs(tsk)->effective_cpus,
+ * which will not contain a sane cpumask during cases such as cpu hotplugging.
+ * This is the absolute last resort for the scheduler and it is only used if
+ * _every_ other avenue has been traveled.
+ **/
+
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
rcu_read_lock();
- do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
+ do_set_cpus_allowed(tsk, is_in_v2_mode() ?
+ task_cs(tsk)->cpus_allowed : cpu_possible_mask);
rcu_read_unlock();
/*
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index c9960baaa14f..8e513a573fe9 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Process number limiting controller for cgroups.
*
@@ -25,10 +26,6 @@
* a superset of parent/child/pids.current.
*
* Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
- *
- * This file is subject to the terms and conditions of version 2 of the GNU
- * General Public License. See the file COPYING in the main directory of the
- * Linux distribution for more details.
*/
#include <linux/kernel.h>
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 1d75ae7f1cb7..ae042c347c64 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* RDMA resource limiting controller for cgroups.
*
@@ -5,10 +6,6 @@
* additional RDMA resources after a certain limit is reached.
*
* Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
- *
- * This file is subject to the terms and conditions of version 2 of the GNU
- * General Public License. See the file COPYING in the main directory of the
- * Linux distribution for more details.
*/
#include <linux/bitops.h>
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index bb95a35e8c2d..ca19b4c8acf5 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"
#include <linux/sched/cputime.h>
diff --git a/kernel/compat.c b/kernel/compat.c
index b5f7063c0db6..a2bc1d6ceb57 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/compat.c
*
@@ -5,10 +6,6 @@
* on 64 bit kernels.
*
* Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/linkage.h>
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 9ad37b9e44a7..be01a4d627c9 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Context tracking: Probe on high level context boundaries such as kernel
* and userspace. This includes syscalls and exceptions entry/exit.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f2ef10460698..e84c0873559e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -522,7 +522,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
/*
* SMT soft disabling on X86 requires to bring the CPU out of the
* BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
- * CPU marked itself as booted_once in cpu_notify_starting() so the
+ * CPU marked itself as booted_once in notify_cpu_starting() so the
* cpu_smt_allowed() check will now return false if this is not the
* primary sibling.
*/
@@ -1221,6 +1221,13 @@ int freeze_secondary_cpus(int primary)
for_each_online_cpu(cpu) {
if (cpu == primary)
continue;
+
+ if (pm_wakeup_pending()) {
+ pr_info("Wakeup pending. Abort CPU freeze\n");
+ error = -EBUSY;
+ break;
+ }
+
trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
@@ -1964,6 +1971,9 @@ static ssize_t write_cpuhp_fail(struct device *dev,
if (ret)
return ret;
+ if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
+ return -EINVAL;
+
/*
* Cannot fail STARTING/DYING callbacks.
*/
@@ -2061,7 +2071,7 @@ static void cpuhp_online_cpu_device(unsigned int cpu)
kobject_uevent(&dev->kobj, KOBJ_ONLINE);
}
-static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
+int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
{
int cpu, ret = 0;
@@ -2093,7 +2103,7 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
return ret;
}
-static int cpuhp_smt_enable(void)
+int cpuhp_smt_enable(void)
{
int cpu, ret = 0;
@@ -2339,6 +2349,9 @@ static int __init mitigations_parse_cmdline(char *arg)
cpu_mitigations = CPU_MITIGATIONS_AUTO;
else if (!strcmp(arg, "auto,nosmt"))
cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
+ else
+ pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
+ arg);
return 0;
}
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 67b02e138a47..cbca6879ab7d 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -1,18 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2011 Google, Inc.
*
* Author:
* Colin Cross <ccross@android.com>
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
*/
#include <linux/kernel.h>
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 093c9f917ed0..9f1557b98468 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* crash.c - kernel crash support code.
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#include <linux/crash_core.h>
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index b64e238b553b..9c23ae074b40 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/crash_dump.h>
#include <linux/init.h>
diff --git a/kernel/cred.c b/kernel/cred.c
index 45d77284aed0..c73a87a4df13 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Task credentials management - see Documentation/security/credentials.rst
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
*/
#include <linux/export.h>
#include <linux/cred.h>
@@ -450,6 +446,15 @@ int commit_creds(struct cred *new)
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
+ /*
+ * If a task drops privileges and becomes nondumpable,
+ * the dumpability change must become visible before
+ * the credential change; otherwise, a __ptrace_may_access()
+ * racing with this change may be able to attach to a task it
+ * shouldn't be able to attach to (as if the task had dropped
+ * privileges without becoming nondumpable).
+ * Pairs with a read barrier in __ptrace_may_access().
+ */
smp_wmb();
}
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
index a85edc339985..332ee6c6ec2c 100644
--- a/kernel/debug/Makefile
+++ b/kernel/debug/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the linux kernel debugger
#
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 2a12b988c717..27725754ac99 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -1,16 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* delayacct.c - per-task delay accounting
*
* Copyright (C) Shailabh Nagar, IBM Corp. 2006
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
*/
#include <linux/sched.h>
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 83d711f8d665..70f8f8d9200e 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config HAS_DMA
bool
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index badd77670d00..099002d84f46 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2008 Advanced Micro Devices, Inc.
*
* Author: Joerg Roedel <joerg.roedel@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define pr_fmt(fmt) "DMA-API: " fmt
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 6f7619c1f877..13f0cb080a4d 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Dynamic DMA mapping support.
*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index abbd4b3b96c2..29e5f7880a4b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5005,6 +5005,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
if (perf_event_check_period(event, value))
return -EINVAL;
+ if (!event->attr.freq && (value & (1ULL << 63)))
+ return -EINVAL;
+
event_function_call(event, __perf_event_period, &value);
return 0;
@@ -5923,7 +5926,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
- } else if (current->mm) {
+ } else if (!(current->flags & PF_KTHREAD)) {
perf_get_regs_user(regs_user, regs, regs_user_copy);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@@ -10033,6 +10036,12 @@ void perf_pmu_unregister(struct pmu *pmu)
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
+static inline bool has_extended_regs(struct perf_event *event)
+{
+ return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
+ (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
+}
+
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
struct perf_event_context *ctx = NULL;
@@ -10064,12 +10073,16 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
perf_event_ctx_unlock(event->group_leader, ctx);
if (!ret) {
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
+ has_extended_regs(event))
+ ret = -EOPNOTSUPP;
+
if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
- event_has_any_exclude_flag(event)) {
- if (event->destroy)
- event->destroy(event);
+ event_has_any_exclude_flag(event))
ret = -EINVAL;
- }
+
+ if (ret && event->destroy)
+ event->destroy(event);
}
if (ret)
@@ -10680,11 +10693,11 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
break;
case CLOCK_BOOTTIME:
- event->clock = &ktime_get_boot_ns;
+ event->clock = &ktime_get_boottime_ns;
break;
case CLOCK_TAI:
- event->clock = &ktime_get_tai_ns;
+ event->clock = &ktime_get_clocktai_ns;
break;
default:
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 79c47076700a..3aef4191798c 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -24,7 +24,7 @@ struct ring_buffer {
atomic_t poll; /* POLL_ for wakeups */
local_t head; /* write position */
- local_t nest; /* nested writers */
+ unsigned int nest; /* nested writers */
local_t events; /* event limit */
local_t wakeup; /* wakeup stamp */
local_t lost; /* nr records lost */
@@ -41,7 +41,7 @@ struct ring_buffer {
/* AUX area */
long aux_head;
- local_t aux_nest;
+ unsigned int aux_nest;
long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */
unsigned long aux_pgoff;
int aux_nr_pages;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 674b35383491..ffb59a4ef4ff 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -38,7 +38,12 @@ static void perf_output_get_handle(struct perf_output_handle *handle)
struct ring_buffer *rb = handle->rb;
preempt_disable();
- local_inc(&rb->nest);
+
+ /*
+ * Avoid an explicit LOAD/STORE such that architectures with memops
+ * can use them.
+ */
+ (*(volatile unsigned int *)&rb->nest)++;
handle->wakeup = local_read(&rb->wakeup);
}
@@ -46,17 +51,35 @@ static void perf_output_put_handle(struct perf_output_handle *handle)
{
struct ring_buffer *rb = handle->rb;
unsigned long head;
+ unsigned int nest;
+
+ /*
+ * If this isn't the outermost nesting, we don't have to update
+ * @rb->user_page->data_head.
+ */
+ nest = READ_ONCE(rb->nest);
+ if (nest > 1) {
+ WRITE_ONCE(rb->nest, nest - 1);
+ goto out;
+ }
again:
+ /*
+ * In order to avoid publishing a head value that goes backwards,
+ * we must ensure the load of @rb->head happens after we've
+ * incremented @rb->nest.
+ *
+ * Otherwise we can observe a @rb->head value before one published
+ * by an IRQ/NMI happening between the load and the increment.
+ */
+ barrier();
head = local_read(&rb->head);
/*
- * IRQ/NMI can happen here, which means we can miss a head update.
+ * IRQ/NMI can happen here and advance @rb->head, causing our
+ * load above to be stale.
*/
- if (!local_dec_and_test(&rb->nest))
- goto out;
-
/*
* Since the mmap() consumer (userspace) can run on a different CPU:
*
@@ -84,14 +107,23 @@ again:
* See perf_output_begin().
*/
smp_wmb(); /* B, matches C */
- rb->user_page->data_head = head;
+ WRITE_ONCE(rb->user_page->data_head, head);
/*
- * Now check if we missed an update -- rely on previous implied
- * compiler barriers to force a re-read.
+ * We must publish the head before decrementing the nest count,
+ * otherwise an IRQ/NMI can publish a more recent head value and our
+ * write will (temporarily) publish a stale value.
*/
+ barrier();
+ WRITE_ONCE(rb->nest, 0);
+
+ /*
+ * Ensure we decrement @rb->nest before we validate the @rb->head.
+ * Otherwise we cannot be sure we caught the 'last' nested update.
+ */
+ barrier();
if (unlikely(head != local_read(&rb->head))) {
- local_inc(&rb->nest);
+ WRITE_ONCE(rb->nest, 1);
goto again;
}
@@ -330,6 +362,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
struct perf_event *output_event = event;
unsigned long aux_head, aux_tail;
struct ring_buffer *rb;
+ unsigned int nest;
if (output_event->parent)
output_event = output_event->parent;
@@ -360,13 +393,16 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (!refcount_inc_not_zero(&rb->aux_refcount))
goto err;
+ nest = READ_ONCE(rb->aux_nest);
/*
* Nesting is not supported for AUX area, make sure nested
* writers are caught early
*/
- if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
+ if (WARN_ON_ONCE(nest))
goto err_put;
+ WRITE_ONCE(rb->aux_nest, nest + 1);
+
aux_head = rb->aux_head;
handle->rb = rb;
@@ -394,7 +430,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (!handle->size) { /* A, matches D */
event->pending_disable = smp_processor_id();
perf_output_wakeup(handle);
- local_set(&rb->aux_nest, 0);
+ WRITE_ONCE(rb->aux_nest, 0);
goto err_put;
}
}
@@ -471,7 +507,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
perf_event_aux_event(handle->event, aux_head, size,
handle->aux_flags);
- rb->user_page->aux_head = rb->aux_head;
+ WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
if (rb_need_aux_wakeup(rb))
wakeup = true;
@@ -483,7 +519,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
handle->event = NULL;
- local_set(&rb->aux_nest, 0);
+ WRITE_ONCE(rb->aux_nest, 0);
/* can't be last */
rb_free_aux(rb);
ring_buffer_put(rb);
@@ -503,7 +539,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
rb->aux_head += size;
- rb->user_page->aux_head = rb->aux_head;
+ WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
if (rb_need_aux_wakeup(rb)) {
perf_output_wakeup(handle);
handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 78f61bfc6b79..97c367f0a9aa 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -46,7 +46,7 @@ static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
-static struct percpu_rw_semaphore dup_mmap_sem;
+DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
@@ -2302,7 +2302,5 @@ void __init uprobes_init(void)
for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]);
- BUG_ON(percpu_init_rwsem(&dup_mmap_sem));
-
BUG_ON(register_die_notifier(&uprobe_exception_nb));
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 8361a560cd1d..a75b6a7f458a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/exit.c
*
@@ -194,6 +195,7 @@ repeat:
rcu_read_unlock();
proc_flush_task(p);
+ cgroup_release(p);
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
@@ -219,7 +221,6 @@ repeat:
}
write_unlock_irq(&tasklist_lock);
- cgroup_release(p);
release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct);
diff --git a/kernel/extable.c b/kernel/extable.c
index 6a5b61ebc66c..e23cce6e6092 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Rewritten by Rusty Russell, on the backs of many others...
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/ftrace.h>
#include <linux/memory.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index b4cba953040a..847dd147b068 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/fork.c
*
@@ -122,7 +123,7 @@
unsigned long total_forks; /* Handle normal Linux uptimes. */
int nr_threads; /* The idle threads do not count.. */
-int max_threads; /* tunable limit on nr_threads */
+static int max_threads; /* tunable limit on nr_threads */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
@@ -247,7 +248,11 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
- return page ? page_address(page) : NULL;
+ if (likely(page)) {
+ tsk->stack = page_address(page);
+ return tsk->stack;
+ }
+ return NULL;
#endif
}
@@ -893,6 +898,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_STACKPROTECTOR
tsk->stack_canary = get_random_canary();
#endif
+ if (orig->cpus_ptr == &orig->cpus_mask)
+ tsk->cpus_ptr = &tsk->cpus_mask;
/*
* One for us, one for whoever does the "release_task()" (usually
@@ -1711,31 +1718,6 @@ const struct file_operations pidfd_fops = {
#endif
};
-/**
- * pidfd_create() - Create a new pid file descriptor.
- *
- * @pid: struct pid that the pidfd will reference
- *
- * This creates a new pid file descriptor with the O_CLOEXEC flag set.
- *
- * Note, that this function can only be called after the fd table has
- * been unshared to avoid leaking the pidfd to the new process.
- *
- * Return: On success, a cloexec pidfd is returned.
- * On error, a negative errno number will be returned.
- */
-static int pidfd_create(struct pid *pid)
-{
- int fd;
-
- fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
- O_RDWR | O_CLOEXEC);
- if (fd < 0)
- put_pid(pid);
-
- return fd;
-}
-
static void __delayed_free_task(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
@@ -1773,6 +1755,7 @@ static __latent_entropy struct task_struct *copy_process(
int pidfd = -1, retval;
struct task_struct *p;
struct multiprocess_signals delayed;
+ struct file *pidfile = NULL;
/*
* Don't allow sharing the root directory with processes in a different
@@ -1821,8 +1804,6 @@ static __latent_entropy struct task_struct *copy_process(
}
if (clone_flags & CLONE_PIDFD) {
- int reserved;
-
/*
* - CLONE_PARENT_SETTID is useless for pidfds and also
* parent_tidptr is used to return pidfds.
@@ -1833,16 +1814,6 @@ static __latent_entropy struct task_struct *copy_process(
if (clone_flags &
(CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
return ERR_PTR(-EINVAL);
-
- /*
- * Verify that parent_tidptr is sane so we can potentially
- * reuse it later.
- */
- if (get_user(reserved, parent_tidptr))
- return ERR_PTR(-EFAULT);
-
- if (reserved != 0)
- return ERR_PTR(-EINVAL);
}
/*
@@ -1983,9 +1954,6 @@ static __latent_entropy struct task_struct *copy_process(
p->pagefault_disabled = 0;
#ifdef CONFIG_LOCKDEP
- p->lockdep_depth = 0; /* no locks held yet */
- p->curr_chain_key = 0;
- p->lockdep_recursion = 0;
lockdep_init_task(p);
#endif
@@ -2057,11 +2025,21 @@ static __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- retval = pidfd_create(pid);
+ retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
+
+ pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+ O_RDWR | O_CLOEXEC);
+ if (IS_ERR(pidfile)) {
+ put_unused_fd(pidfd);
+ retval = PTR_ERR(pidfile);
+ goto bad_fork_free_pid;
+ }
+ get_pid(pid); /* held by pidfile now */
+
retval = put_user(pidfd, parent_tidptr);
if (retval)
goto bad_fork_put_pidfd;
@@ -2138,7 +2116,7 @@ static __latent_entropy struct task_struct *copy_process(
*/
p->start_time = ktime_get_ns();
- p->real_start_time = ktime_get_boot_ns();
+ p->real_start_time = ktime_get_boottime_ns();
/*
* Make it visible to the rest of the system, but dont wake it up yet.
@@ -2179,6 +2157,9 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cancel_cgroup;
}
+ /* past the last point of failure */
+ if (pidfile)
+ fd_install(pidfd, pidfile);
init_task_pid_links(p);
if (likely(p->pid)) {
@@ -2245,8 +2226,10 @@ bad_fork_cancel_cgroup:
bad_fork_cgroup_threadgroup_change_end:
cgroup_threadgroup_change_end(current);
bad_fork_put_pidfd:
- if (clone_flags & CLONE_PIDFD)
- ksys_close(pidfd);
+ if (clone_flags & CLONE_PIDFD) {
+ fput(pidfile);
+ put_unused_fd(pidfd);
+ }
bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index b162b74611e4..c0738424bb43 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/freezer.c - Function to freeze a process
*
diff --git a/kernel/futex.c b/kernel/futex.c
index 2268b97d5439..6d50728ef2e7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Fast Userspace Mutexes (which I call "Futexes!").
* (C) Rusty Russell, IBM 2002
@@ -29,20 +30,6 @@
*
* "The futexes are also cursed."
* "But they come in a choice of three flavours!"
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/compat.h>
#include <linux/slab.h>
@@ -484,6 +471,37 @@ enum futex_access {
};
/**
+ * futex_setup_timer - set up the sleeping hrtimer.
+ * @time: ptr to the given timeout value
+ * @timeout: the hrtimer_sleeper structure to be set up
+ * @flags: futex flags
+ * @range_ns: optional range in ns
+ *
+ * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
+ * value given
+ */
+static inline struct hrtimer_sleeper *
+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
+ int flags, u64 range_ns)
+{
+ if (!time)
+ return NULL;
+
+ hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(timeout, current);
+
+ /*
+ * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
+ * effectively the same as calling hrtimer_set_expires().
+ */
+ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
+
+ return timeout;
+}
+
+/**
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
@@ -2692,7 +2710,7 @@ out:
static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
ktime_t *abs_time, u32 bitset)
{
- struct hrtimer_sleeper timeout, *to = NULL;
+ struct hrtimer_sleeper timeout, *to;
struct restart_block *restart;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
@@ -2702,17 +2720,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
return -EINVAL;
q.bitset = bitset;
- if (abs_time) {
- to = &timeout;
-
- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(to, current);
- hrtimer_set_expires_range_ns(&to->timer, *abs_time,
- current->timer_slack_ns);
- }
-
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
retry:
/*
* Prepare to wait on uaddr. On success, holds hb lock and increments
@@ -2792,7 +2801,7 @@ static long futex_wait_restart(struct restart_block *restart)
static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
{
- struct hrtimer_sleeper timeout, *to = NULL;
+ struct hrtimer_sleeper timeout, *to;
struct futex_pi_state *pi_state = NULL;
struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
@@ -2805,13 +2814,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
if (refill_pi_state_cache())
return -ENOMEM;
- if (time) {
- to = &timeout;
- hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(to, current);
- hrtimer_set_expires(&to->timer, *time);
- }
+ to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);
retry:
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
@@ -3208,7 +3211,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
u32 val, ktime_t *abs_time, u32 bitset,
u32 __user *uaddr2)
{
- struct hrtimer_sleeper timeout, *to = NULL;
+ struct hrtimer_sleeper timeout, *to;
struct futex_pi_state *pi_state = NULL;
struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
@@ -3225,15 +3228,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (!bitset)
return -EINVAL;
- if (abs_time) {
- to = &timeout;
- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(to, current);
- hrtimer_set_expires_range_ns(&to->timer, *abs_time,
- current->timer_slack_ns);
- }
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
/*
* The waiter is allocated on our stack, manipulated by the requeue
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index f71c1adcff31..3941a9c48f83 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
menu "GCOV-based kernel profiling"
config GCOV_KERNEL
diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_kheaders.sh
index 591a94f7b387..9a34e1d9bd7f 100755
--- a/kernel/gen_ikh_data.sh
+++ b/kernel/gen_kheaders.sh
@@ -2,7 +2,7 @@
# SPDX-License-Identifier: GPL-2.0
# This script generates an archive consisting of kernel headers
-# for CONFIG_IKHEADERS_PROC.
+# for CONFIG_IKHEADERS.
set -e
spath="$(dirname "$(readlink -f "$0")")"
kroot="$spath/.."
@@ -31,9 +31,8 @@ arch/$SRCARCH/include/
# This block is useful for debugging the incremental builds.
# Uncomment it for debugging.
-# iter=1
-# if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter;
-# else; iter=$(($(cat /tmp/iter) + 1)); fi
+# if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter;
+# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi
# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter
# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter
@@ -43,10 +42,18 @@ arch/$SRCARCH/include/
pushd $kroot > /dev/null
src_files_md5="$(find $src_file_list -type f |
grep -v "include/generated/compile.h" |
+ grep -v "include/generated/autoconf.h" |
+ grep -v "include/config/auto.conf" |
+ grep -v "include/config/auto.conf.cmd" |
+ grep -v "include/config/tristate.conf" |
xargs ls -lR | md5sum | cut -d ' ' -f1)"
popd > /dev/null
obj_files_md5="$(find $obj_file_list -type f |
grep -v "include/generated/compile.h" |
+ grep -v "include/generated/autoconf.h" |
+ grep -v "include/config/auto.conf" |
+ grep -v "include/config/auto.conf.cmd" |
+ grep -v "include/config/tristate.conf" |
xargs ls -lR | md5sum | cut -d ' ' -f1)"
if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi
@@ -82,7 +89,7 @@ find $cpio_dir -type f -print0 |
tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null
-echo "$src_files_md5" > kernel/kheaders.md5
+echo "$src_files_md5" > kernel/kheaders.md5
echo "$obj_files_md5" >> kernel/kheaders.md5
echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index f108a95882c6..14a625c16cb3 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Detect Hung Task
*
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 8fee06625c37..f92d9a687372 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
menu "IRQ subsystem"
# Options selectable by the architecture code
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index ff6e352e3a6c..b4f53717d143 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -2,6 +2,9 @@
obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
obj-$(CONFIG_IRQ_TIMINGS) += timings.o
+ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y)
+ CFLAGS_timings.o += -DDEBUG
+endif
obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f18cd5aa33e8..4352b08ae48d 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -94,8 +94,7 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
return nodes;
}
-static int __irq_build_affinity_masks(const struct irq_affinity *affd,
- unsigned int startvec,
+static int __irq_build_affinity_masks(unsigned int startvec,
unsigned int numvecs,
unsigned int firstvec,
cpumask_var_t *node_to_cpumask,
@@ -171,8 +170,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
* 1) spread present CPU on these vectors
* 2) spread other possible CPUs on these vectors
*/
-static int irq_build_affinity_masks(const struct irq_affinity *affd,
- unsigned int startvec, unsigned int numvecs,
+static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
unsigned int firstvec,
struct irq_affinity_desc *masks)
{
@@ -197,7 +195,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
build_node_to_cpumask(node_to_cpumask);
/* Spread on present CPUs starting from affd->pre_vectors */
- nr_present = __irq_build_affinity_masks(affd, curvec, numvecs,
+ nr_present = __irq_build_affinity_masks(curvec, numvecs,
firstvec, node_to_cpumask,
cpu_present_mask, nmsk, masks);
@@ -212,7 +210,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
else
curvec = firstvec + nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- nr_others = __irq_build_affinity_masks(affd, curvec, numvecs,
+ nr_others = __irq_build_affinity_masks(curvec, numvecs,
firstvec, node_to_cpumask,
npresmsk, nmsk, masks);
put_online_cpus();
@@ -295,7 +293,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
unsigned int this_vecs = affd->set_size[i];
int ret;
- ret = irq_build_affinity_masks(affd, curvec, this_vecs,
+ ret = irq_build_affinity_masks(curvec, this_vecs,
curvec, masks);
if (ret) {
kfree(masks);
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 16cbf6beb276..ae60cae24e9a 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -90,7 +90,7 @@ unsigned long probe_irq_on(void)
/* It triggered already - consider it spurious. */
if (!(desc->istate & IRQS_WAITING)) {
desc->istate &= ~IRQS_AUTODETECT;
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
} else
if (i < 32)
mask |= 1 << i;
@@ -127,7 +127,7 @@ unsigned int probe_irq_mask(unsigned long val)
mask |= 1 << i;
desc->istate &= ~IRQS_AUTODETECT;
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
@@ -169,7 +169,7 @@ int probe_irq_off(unsigned long val)
nr_of_irqs++;
}
desc->istate &= ~IRQS_AUTODETECT;
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 29d6c7d070b4..b76703b2c0af 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -314,6 +314,12 @@ void irq_shutdown(struct irq_desc *desc)
}
irq_state_clr_started(desc);
}
+}
+
+
+void irq_shutdown_and_deactivate(struct irq_desc *desc)
+{
+ irq_shutdown(desc);
/*
* This must be called even if the interrupt was never started up,
* because the activation can happen before the interrupt is
@@ -748,6 +754,8 @@ void handle_fasteoi_nmi(struct irq_desc *desc)
unsigned int irq = irq_desc_get_irq(desc);
irqreturn_t res;
+ __kstat_incr_irqs_this_cpu(desc);
+
trace_irq_handler_entry(irq, action);
/*
* NMIs cannot be shared, there is only one action.
@@ -962,6 +970,8 @@ void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc)
unsigned int irq = irq_desc_get_irq(desc);
irqreturn_t res;
+ __kstat_incr_irqs_this_cpu(desc);
+
trace_irq_handler_entry(irq, action);
res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
trace_irq_handler_exit(irq, action, res);
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 5b1072e394b2..6c7ca2e983a5 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -116,7 +116,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
*/
if (irqd_affinity_is_managed(d)) {
irqd_set_managed_shutdown(d);
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
return false;
}
affinity = cpu_online_mask;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 70c3053bc1f6..3924fbe829d4 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -82,6 +82,7 @@ extern int irq_activate_and_startup(struct irq_desc *desc, bool resend);
extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
extern void irq_shutdown(struct irq_desc *desc);
+extern void irq_shutdown_and_deactivate(struct irq_desc *desc);
extern void irq_enable(struct irq_desc *desc);
extern void irq_disable(struct irq_desc *desc);
extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
@@ -96,6 +97,10 @@ static inline void irq_mark_irq(unsigned int irq) { }
extern void irq_mark_irq(unsigned int irq);
#endif
+extern int __irq_get_irqchip_state(struct irq_data *data,
+ enum irqchip_irq_state which,
+ bool *state);
+
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
@@ -354,6 +359,16 @@ static inline int irq_timing_decode(u64 value, u64 *timestamp)
return value & U16_MAX;
}
+static __always_inline void irq_timings_push(u64 ts, int irq)
+{
+ struct irq_timings *timings = this_cpu_ptr(&irq_timings);
+
+ timings->values[timings->count & IRQ_TIMINGS_MASK] =
+ irq_timing_encode(ts, irq);
+
+ timings->count++;
+}
+
/*
* The function record_irq_time is only called in one place in the
* interrupts handler. We want this function always inline so the code
@@ -367,15 +382,8 @@ static __always_inline void record_irq_time(struct irq_desc *desc)
if (!static_branch_likely(&irq_timing_enabled))
return;
- if (desc->istate & IRQS_TIMINGS) {
- struct irq_timings *timings = this_cpu_ptr(&irq_timings);
-
- timings->values[timings->count & IRQ_TIMINGS_MASK] =
- irq_timing_encode(local_clock(),
- irq_desc_get_irq(desc));
-
- timings->count++;
- }
+ if (desc->istate & IRQS_TIMINGS)
+ irq_timings_push(local_clock(), irq_desc_get_irq(desc));
}
#else
static inline void irq_remove_timings(struct irq_desc *desc) {}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index c52b737ab8e3..9484e88dabc2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -680,6 +680,8 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
* @hwirq: The HW irq number to convert to a logical one
* @regs: Register file coming from the low-level handling code
*
+ * This function must be called from an NMI context.
+ *
* Returns: 0 on success, or -EINVAL if conversion has failed
*/
int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
@@ -689,7 +691,10 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
unsigned int irq;
int ret = 0;
- nmi_enter();
+ /*
+ * NMI context needs to be setup earlier in order to deal with tracing.
+ */
+ WARN_ON(!in_nmi());
irq = irq_find_mapping(domain, hwirq);
@@ -702,7 +707,6 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
else
ret = -EINVAL;
- nmi_exit();
set_irq_regs(old_regs);
return ret;
}
@@ -946,6 +950,11 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
*per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
}
+static bool irq_is_nmi(struct irq_desc *desc)
+{
+ return desc->istate & IRQS_NMI;
+}
+
/**
* kstat_irqs - Get the statistics for an interrupt
* @irq: The interrupt number
@@ -963,7 +972,8 @@ unsigned int kstat_irqs(unsigned int irq)
if (!desc || !desc->kstat_irqs)
return 0;
if (!irq_settings_is_per_cpu_devid(desc) &&
- !irq_settings_is_per_cpu(desc))
+ !irq_settings_is_per_cpu(desc) &&
+ !irq_is_nmi(desc))
return desc->tot_count;
for_each_possible_cpu(cpu)
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index a453e229f99c..3078d0e48bba 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -123,7 +123,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
* @ops: domain callbacks
* @host_data: Controller private data pointer
*
- * Allocates and initialize and irq_domain structure.
+ * Allocates and initializes an irq_domain structure.
* Returns pointer to IRQ domain, or NULL on failure.
*/
struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
@@ -139,7 +139,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
GFP_KERNEL, of_node_to_nid(of_node));
- if (WARN_ON(!domain))
+ if (!domain)
return NULL;
if (fwnode && is_fwnode_irqchip(fwnode)) {
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 78f3ddeb7fe4..e8f7f179bf77 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
+#include <linux/irqdomain.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
@@ -34,8 +35,9 @@ static int __init setup_forced_irqthreads(char *arg)
early_param("threadirqs", setup_forced_irqthreads);
#endif
-static void __synchronize_hardirq(struct irq_desc *desc)
+static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
{
+ struct irq_data *irqd = irq_desc_get_irq_data(desc);
bool inprogress;
do {
@@ -51,6 +53,20 @@ static void __synchronize_hardirq(struct irq_desc *desc)
/* Ok, that indicated we're done: double-check carefully. */
raw_spin_lock_irqsave(&desc->lock, flags);
inprogress = irqd_irq_inprogress(&desc->irq_data);
+
+ /*
+ * If requested and supported, check at the chip whether it
+ * is in flight at the hardware level, i.e. already pending
+ * in a CPU and waiting for service and acknowledge.
+ */
+ if (!inprogress && sync_chip) {
+ /*
+ * Ignore the return code. inprogress is only updated
+ * when the chip supports it.
+ */
+ __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE,
+ &inprogress);
+ }
raw_spin_unlock_irqrestore(&desc->lock, flags);
/* Oops, that failed? */
@@ -73,13 +89,18 @@ static void __synchronize_hardirq(struct irq_desc *desc)
* Returns: false if a threaded handler is active.
*
* This function may be called - with care - from IRQ context.
+ *
+ * It does not check whether there is an interrupt in flight at the
+ * hardware level, but not serviced yet, as this might deadlock when
+ * called with interrupts disabled and the target CPU of the interrupt
+ * is the current CPU.
*/
bool synchronize_hardirq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
if (desc) {
- __synchronize_hardirq(desc);
+ __synchronize_hardirq(desc, false);
return !atomic_read(&desc->threads_active);
}
@@ -95,14 +116,19 @@ EXPORT_SYMBOL(synchronize_hardirq);
* to complete before returning. If you use this function while
* holding a resource the IRQ handler may need you will deadlock.
*
- * This function may be called - with care - from IRQ context.
+ * Can only be called from preemptible code as it might sleep when
+ * an interrupt thread is associated to @irq.
+ *
+ * It optionally makes sure (when the irq chip supports that method)
+ * that the interrupt is not pending in any CPU and waiting for
+ * service.
*/
void synchronize_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
if (desc) {
- __synchronize_hardirq(desc);
+ __synchronize_hardirq(desc, true);
/*
* We made sure that no hardirq handler is
* running. Now verify that no threaded handlers are
@@ -1699,6 +1725,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
/* If this was the last handler, shut down the IRQ line: */
if (!desc->action) {
irq_settings_clr_disable_unlazy(desc);
+ /* Only shutdown. Deactivate after synchronize_hardirq() */
irq_shutdown(desc);
}
@@ -1727,8 +1754,12 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
unregister_handler_proc(irq, action);
- /* Make sure it's not being used on another CPU: */
- synchronize_hardirq(irq);
+ /*
+ * Make sure it's not being used on another CPU and if the chip
+ * supports it also make sure that there is no (not yet serviced)
+ * interrupt in flight at the hardware level.
+ */
+ __synchronize_hardirq(desc, true);
#ifdef CONFIG_DEBUG_SHIRQ
/*
@@ -1768,6 +1799,14 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
* require it to deallocate resources over the slow bus.
*/
chip_bus_lock(desc);
+ /*
+ * There is no interrupt on the fly anymore. Deactivate it
+ * completely.
+ */
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ irq_domain_deactivate_irq(&desc->irq_data);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
irq_release_resources(desc);
chip_bus_sync_unlock(desc);
irq_remove_timings(desc);
@@ -1855,7 +1894,7 @@ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc)
}
irq_settings_clr_disable_unlazy(desc);
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
irq_release_resources(desc);
@@ -2578,6 +2617,28 @@ out:
irq_put_desc_unlock(desc, flags);
}
+int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which,
+ bool *state)
+{
+ struct irq_chip *chip;
+ int err = -EINVAL;
+
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_get_irqchip_state)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
+ err = chip->irq_get_irqchip_state(data, which, state);
+ return err;
+}
+
/**
* irq_get_irqchip_state - returns the irqchip state of a interrupt.
* @irq: Interrupt line that is forwarded to a VM
@@ -2596,7 +2657,6 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
{
struct irq_desc *desc;
struct irq_data *data;
- struct irq_chip *chip;
unsigned long flags;
int err = -EINVAL;
@@ -2606,19 +2666,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
data = irq_desc_get_irq_data(desc);
- do {
- chip = irq_data_get_irq_chip(data);
- if (chip->irq_get_irqchip_state)
- break;
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- data = data->parent_data;
-#else
- data = NULL;
-#endif
- } while (data);
-
- if (data)
- err = chip->irq_get_irqchip_state(data, which, state);
+ err = __irq_get_irqchip_state(data, which, state);
irq_put_desc_busunlock(desc, flags);
return err;
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 90c735da15d0..e960d7ce7bcc 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -1,10 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
+#define pr_fmt(fmt) "irq_timings: " fmt
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/static_key.h>
+#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/idr.h>
#include <linux/irq.h>
@@ -261,12 +263,29 @@ void irq_timings_disable(void)
#define EMA_ALPHA_VAL 64
#define EMA_ALPHA_SHIFT 7
-#define PREDICTION_PERIOD_MIN 2
+#define PREDICTION_PERIOD_MIN 3
#define PREDICTION_PERIOD_MAX 5
#define PREDICTION_FACTOR 4
#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */
#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */
+/*
+ * Number of elements in the circular buffer: If it happens it was
+ * flushed before, then the number of elements could be smaller than
+ * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is
+ * used as we wrapped. The index begins from zero when we did not
+ * wrap. That could be done in a nicer way with the proper circular
+ * array structure type but with the cost of extra computation in the
+ * interrupt handler hot path. We choose efficiency.
+ */
+#define for_each_irqts(i, irqts) \
+ for (i = irqts->count < IRQ_TIMINGS_SIZE ? \
+ 0 : irqts->count & IRQ_TIMINGS_MASK, \
+ irqts->count = min(IRQ_TIMINGS_SIZE, \
+ irqts->count); \
+ irqts->count > 0; irqts->count--, \
+ i = (i + 1) & IRQ_TIMINGS_MASK)
+
struct irqt_stat {
u64 last_ts;
u64 ema_time[PREDICTION_BUFFER_SIZE];
@@ -297,7 +316,16 @@ static u64 irq_timings_ema_new(u64 value, u64 ema_old)
static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
{
- int i;
+ int period;
+
+ /*
+ * Move the beginning pointer to the end minus the max period x 3.
+ * We are at the point we can begin searching the pattern
+ */
+ buffer = &buffer[len - (period_max * 3)];
+
+ /* Adjust the length to the maximum allowed period x 3 */
+ len = period_max * 3;
/*
* The buffer contains the suite of intervals, in a ilog2
@@ -306,21 +334,45 @@ static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
* period beginning at the end of the buffer. We do that for
* each suffix.
*/
- for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) {
+ for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) {
- int *begin = &buffer[len - (i * 3)];
- int *ptr = begin;
+ /*
+ * The first comparison always succeed because the
+ * suffix is deduced from the first n-period bytes of
+ * the buffer and we compare the initial suffix with
+ * itself, so we can skip the first iteration.
+ */
+ int idx = period;
+ size_t size = period;
/*
* We look if the suite with period 'i' repeat
* itself. If it is truncated at the end, as it
* repeats we can use the period to find out the next
- * element.
+ * element with the modulo.
*/
- while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
- ptr += i;
- if (ptr >= &buffer[len])
- return begin[((i * 3) % i)];
+ while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
+
+ /*
+ * Move the index in a period basis
+ */
+ idx += size;
+
+ /*
+ * If this condition is reached, all previous
+ * memcmp were successful, so the period is
+ * found.
+ */
+ if (idx == len)
+ return buffer[len % period];
+
+ /*
+ * If the remaining elements to compare are
+ * smaller than the period, readjust the size
+ * of the comparison for the last iteration.
+ */
+ if (len - idx < period)
+ size = len - idx;
}
}
@@ -380,11 +432,43 @@ static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
return irqs->last_ts + irqs->ema_time[index];
}
+static __always_inline int irq_timings_interval_index(u64 interval)
+{
+ /*
+ * The PREDICTION_FACTOR increase the interval size for the
+ * array of exponential average.
+ */
+ u64 interval_us = (interval >> 10) / PREDICTION_FACTOR;
+
+ return likely(interval_us) ? ilog2(interval_us) : 0;
+}
+
+static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs,
+ u64 interval)
+{
+ int index;
+
+ /*
+ * Get the index in the ema table for this interrupt.
+ */
+ index = irq_timings_interval_index(interval);
+
+ /*
+ * Store the index as an element of the pattern in another
+ * circular array.
+ */
+ irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
+
+ irqs->ema_time[index] = irq_timings_ema_new(interval,
+ irqs->ema_time[index]);
+
+ irqs->count++;
+}
+
static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
{
u64 old_ts = irqs->last_ts;
u64 interval;
- int index;
/*
* The timestamps are absolute time values, we need to compute
@@ -415,24 +499,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
return;
}
- /*
- * Get the index in the ema table for this interrupt. The
- * PREDICTION_FACTOR increase the interval size for the array
- * of exponential average.
- */
- index = likely(interval) ?
- ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;
-
- /*
- * Store the index as an element of the pattern in another
- * circular array.
- */
- irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
-
- irqs->ema_time[index] = irq_timings_ema_new(interval,
- irqs->ema_time[index]);
-
- irqs->count++;
+ __irq_timings_store(irq, irqs, interval);
}
/**
@@ -493,11 +560,7 @@ u64 irq_timings_next_event(u64 now)
* model while decrementing the counter because we consume the
* data from our circular buffer.
*/
-
- i = (irqts->count & IRQ_TIMINGS_MASK) - 1;
- irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
-
- for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
+ for_each_irqts(i, irqts) {
irq = irq_timing_decode(irqts->values[i], &ts);
s = idr_find(&irqt_stats, irq);
if (s)
@@ -564,3 +627,325 @@ int irq_timings_alloc(int irq)
return 0;
}
+
+#ifdef CONFIG_TEST_IRQ_TIMINGS
+struct timings_intervals {
+ u64 *intervals;
+ size_t count;
+};
+
+/*
+ * Intervals are given in nanosecond base
+ */
+static u64 intervals0[] __initdata = {
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000,
+};
+
+static u64 intervals1[] __initdata = {
+ 223947000, 1240000, 1384000, 1386000, 1386000,
+ 217416000, 1236000, 1384000, 1386000, 1387000,
+ 214719000, 1241000, 1386000, 1387000, 1384000,
+ 213696000, 1234000, 1384000, 1386000, 1388000,
+ 219904000, 1240000, 1385000, 1389000, 1385000,
+ 212240000, 1240000, 1386000, 1386000, 1386000,
+ 214415000, 1236000, 1384000, 1386000, 1387000,
+ 214276000, 1234000,
+};
+
+static u64 intervals2[] __initdata = {
+ 4000, 3000, 5000, 100000,
+ 3000, 3000, 5000, 117000,
+ 4000, 4000, 5000, 112000,
+ 4000, 3000, 4000, 110000,
+ 3000, 5000, 3000, 117000,
+ 4000, 4000, 5000, 112000,
+ 4000, 3000, 4000, 110000,
+ 3000, 4000, 5000, 112000,
+ 4000,
+};
+
+static u64 intervals3[] __initdata = {
+ 1385000, 212240000, 1240000,
+ 1386000, 214415000, 1236000,
+ 1384000, 214276000, 1234000,
+ 1386000, 214415000, 1236000,
+ 1385000, 212240000, 1240000,
+ 1386000, 214415000, 1236000,
+ 1384000, 214276000, 1234000,
+ 1386000, 214415000, 1236000,
+ 1385000, 212240000, 1240000,
+};
+
+static u64 intervals4[] __initdata = {
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000,
+};
+
+static struct timings_intervals tis[] __initdata = {
+ { intervals0, ARRAY_SIZE(intervals0) },
+ { intervals1, ARRAY_SIZE(intervals1) },
+ { intervals2, ARRAY_SIZE(intervals2) },
+ { intervals3, ARRAY_SIZE(intervals3) },
+ { intervals4, ARRAY_SIZE(intervals4) },
+};
+
+static int __init irq_timings_test_next_index(struct timings_intervals *ti)
+{
+ int _buffer[IRQ_TIMINGS_SIZE];
+ int buffer[IRQ_TIMINGS_SIZE];
+ int index, start, i, count, period_max;
+
+ count = ti->count - 1;
+
+ period_max = count > (3 * PREDICTION_PERIOD_MAX) ?
+ PREDICTION_PERIOD_MAX : count / 3;
+
+ /*
+ * Inject all values except the last one which will be used
+ * to compare with the next index result.
+ */
+ pr_debug("index suite: ");
+
+ for (i = 0; i < count; i++) {
+ index = irq_timings_interval_index(ti->intervals[i]);
+ _buffer[i & IRQ_TIMINGS_MASK] = index;
+ pr_cont("%d ", index);
+ }
+
+ start = count < IRQ_TIMINGS_SIZE ? 0 :
+ count & IRQ_TIMINGS_MASK;
+
+ count = min_t(int, count, IRQ_TIMINGS_SIZE);
+
+ for (i = 0; i < count; i++) {
+ int index = (start + i) & IRQ_TIMINGS_MASK;
+ buffer[i] = _buffer[index];
+ }
+
+ index = irq_timings_next_event_index(buffer, count, period_max);
+ i = irq_timings_interval_index(ti->intervals[ti->count - 1]);
+
+ if (index != i) {
+ pr_err("Expected (%d) and computed (%d) next indexes differ\n",
+ i, index);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __init irq_timings_next_index_selftest(void)
+{
+ int i, ret;
+
+ for (i = 0; i < ARRAY_SIZE(tis); i++) {
+
+ pr_info("---> Injecting intervals number #%d (count=%zd)\n",
+ i, tis[i].count);
+
+ ret = irq_timings_test_next_index(&tis[i]);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int __init irq_timings_test_irqs(struct timings_intervals *ti)
+{
+ struct irqt_stat __percpu *s;
+ struct irqt_stat *irqs;
+ int i, index, ret, irq = 0xACE5;
+
+ ret = irq_timings_alloc(irq);
+ if (ret) {
+ pr_err("Failed to allocate irq timings\n");
+ return ret;
+ }
+
+ s = idr_find(&irqt_stats, irq);
+ if (!s) {
+ ret = -EIDRM;
+ goto out;
+ }
+
+ irqs = this_cpu_ptr(s);
+
+ for (i = 0; i < ti->count; i++) {
+
+ index = irq_timings_interval_index(ti->intervals[i]);
+ pr_debug("%d: interval=%llu ema_index=%d\n",
+ i, ti->intervals[i], index);
+
+ __irq_timings_store(irq, irqs, ti->intervals[i]);
+ if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) {
+ pr_err("Failed to store in the circular buffer\n");
+ goto out;
+ }
+ }
+
+ if (irqs->count != ti->count) {
+ pr_err("Count differs\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ irq_timings_free(irq);
+
+ return ret;
+}
+
+static int __init irq_timings_irqs_selftest(void)
+{
+ int i, ret;
+
+ for (i = 0; i < ARRAY_SIZE(tis); i++) {
+ pr_info("---> Injecting intervals number #%d (count=%zd)\n",
+ i, tis[i].count);
+ ret = irq_timings_test_irqs(&tis[i]);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int __init irq_timings_test_irqts(struct irq_timings *irqts,
+ unsigned count)
+{
+ int start = count >= IRQ_TIMINGS_SIZE ? count - IRQ_TIMINGS_SIZE : 0;
+ int i, irq, oirq = 0xBEEF;
+ u64 ots = 0xDEAD, ts;
+
+ /*
+ * Fill the circular buffer by using the dedicated function.
+ */
+ for (i = 0; i < count; i++) {
+ pr_debug("%d: index=%d, ts=%llX irq=%X\n",
+ i, i & IRQ_TIMINGS_MASK, ots + i, oirq + i);
+
+ irq_timings_push(ots + i, oirq + i);
+ }
+
+ /*
+ * Compute the first elements values after the index wrapped
+ * up or not.
+ */
+ ots += start;
+ oirq += start;
+
+ /*
+ * Test the circular buffer count is correct.
+ */
+ pr_debug("---> Checking timings array count (%d) is right\n", count);
+ if (WARN_ON(irqts->count != count))
+ return -EINVAL;
+
+ /*
+ * Test the macro allowing to browse all the irqts.
+ */
+ pr_debug("---> Checking the for_each_irqts() macro\n");
+ for_each_irqts(i, irqts) {
+
+ irq = irq_timing_decode(irqts->values[i], &ts);
+
+ pr_debug("index=%d, ts=%llX / %llX, irq=%X / %X\n",
+ i, ts, ots, irq, oirq);
+
+ if (WARN_ON(ts != ots || irq != oirq))
+ return -EINVAL;
+
+ ots++; oirq++;
+ }
+
+ /*
+ * The circular buffer should have be flushed when browsed
+ * with for_each_irqts
+ */
+ pr_debug("---> Checking timings array is empty after browsing it\n");
+ if (WARN_ON(irqts->count))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init irq_timings_irqts_selftest(void)
+{
+ struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
+ int i, ret;
+
+ /*
+ * Test the circular buffer with different number of
+ * elements. The purpose is to test at the limits (empty, half
+ * full, full, wrapped with the cursor at the boundaries,
+ * wrapped several times, etc ...
+ */
+ int count[] = { 0,
+ IRQ_TIMINGS_SIZE >> 1,
+ IRQ_TIMINGS_SIZE,
+ IRQ_TIMINGS_SIZE + (IRQ_TIMINGS_SIZE >> 1),
+ 2 * IRQ_TIMINGS_SIZE,
+ (2 * IRQ_TIMINGS_SIZE) + 3,
+ };
+
+ for (i = 0; i < ARRAY_SIZE(count); i++) {
+
+ pr_info("---> Checking the timings with %d/%d values\n",
+ count[i], IRQ_TIMINGS_SIZE);
+
+ ret = irq_timings_test_irqts(irqts, count[i]);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int __init irq_timings_selftest(void)
+{
+ int ret;
+
+ pr_info("------------------- selftest start -----------------\n");
+
+ /*
+ * At this point, we don't except any subsystem to use the irq
+ * timings but us, so it should not be enabled.
+ */
+ if (static_branch_unlikely(&irq_timing_enabled)) {
+ pr_warn("irq timings already initialized, skipping selftest\n");
+ return 0;
+ }
+
+ ret = irq_timings_irqts_selftest();
+ if (ret)
+ goto out;
+
+ ret = irq_timings_irqs_selftest();
+ if (ret)
+ goto out;
+
+ ret = irq_timings_next_index_selftest();
+out:
+ pr_info("---------- selftest end with %s -----------\n",
+ ret ? "failure" : "success");
+
+ return ret;
+}
+early_initcall(irq_timings_selftest);
+#endif
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 73288914ed5e..d42acaf81886 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
*
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index de6efdecc70d..df3008419a1d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* jump label support
*
@@ -36,12 +37,26 @@ static int jump_label_cmp(const void *a, const void *b)
const struct jump_entry *jea = a;
const struct jump_entry *jeb = b;
+ /*
+ * Entrires are sorted by key.
+ */
if (jump_entry_key(jea) < jump_entry_key(jeb))
return -1;
if (jump_entry_key(jea) > jump_entry_key(jeb))
return 1;
+ /*
+ * In the batching mode, entries should also be sorted by the code
+ * inside the already sorted list of entries, enabling a bsearch in
+ * the vector.
+ */
+ if (jump_entry_code(jea) < jump_entry_code(jeb))
+ return -1;
+
+ if (jump_entry_code(jea) > jump_entry_code(jeb))
+ return 1;
+
return 0;
}
@@ -383,25 +398,55 @@ static enum jump_label_type jump_label_type(struct jump_entry *entry)
return enabled ^ branch;
}
+static bool jump_label_can_update(struct jump_entry *entry, bool init)
+{
+ /*
+ * Cannot update code that was in an init text area.
+ */
+ if (!init && jump_entry_is_init(entry))
+ return false;
+
+ if (!kernel_text_address(jump_entry_code(entry))) {
+ WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry));
+ return false;
+ }
+
+ return true;
+}
+
+#ifndef HAVE_JUMP_LABEL_BATCH
static void __jump_label_update(struct static_key *key,
struct jump_entry *entry,
struct jump_entry *stop,
bool init)
{
for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
- /*
- * An entry->code of 0 indicates an entry which has been
- * disabled because it was in an init text area.
- */
- if (init || !jump_entry_is_init(entry)) {
- if (kernel_text_address(jump_entry_code(entry)))
- arch_jump_label_transform(entry, jump_label_type(entry));
- else
- WARN_ONCE(1, "can't patch jump_label at %pS",
- (void *)jump_entry_code(entry));
+ if (jump_label_can_update(entry, init))
+ arch_jump_label_transform(entry, jump_label_type(entry));
+ }
+}
+#else
+static void __jump_label_update(struct static_key *key,
+ struct jump_entry *entry,
+ struct jump_entry *stop,
+ bool init)
+{
+ for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
+
+ if (!jump_label_can_update(entry, init))
+ continue;
+
+ if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) {
+ /*
+ * Queue is full: Apply the current queue and try again.
+ */
+ arch_jump_label_transform_apply();
+ BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry)));
}
}
+ arch_jump_label_transform_apply();
}
+#endif
void __init jump_label_init(void)
{
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 14934afa9e68..95a260f9214b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
*
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 68559808fdfa..1b018f1a6e0d 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kexec.c - kexec_load system call
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index fd5c95ff9251..d5870723b8ad 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kexec.c - kexec system call core code.
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 072b6ee55e3f..ef7b951a8087 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kexec: kexec_file_load system call
*
* Copyright (C) 2014 Red Hat Inc.
* Authors:
* Vivek Goyal <vgoyal@redhat.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
index 70ae6052920d..8f69772af77b 100644
--- a/kernel/kheaders.c
+++ b/kernel/kheaders.c
@@ -8,9 +8,8 @@
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/proc_fs.h>
+#include <linux/kobject.h>
#include <linux/init.h>
-#include <linux/uaccess.h>
/*
* Define kernel_headers_data and kernel_headers_data_end, within which the
@@ -31,39 +30,32 @@ extern char kernel_headers_data;
extern char kernel_headers_data_end;
static ssize_t
-ikheaders_read_current(struct file *file, char __user *buf,
- size_t len, loff_t *offset)
+ikheaders_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t len)
{
- return simple_read_from_buffer(buf, len, offset,
- &kernel_headers_data,
- &kernel_headers_data_end -
- &kernel_headers_data);
+ memcpy(buf, &kernel_headers_data + off, len);
+ return len;
}
-static const struct file_operations ikheaders_file_ops = {
- .read = ikheaders_read_current,
- .llseek = default_llseek,
+static struct bin_attribute kheaders_attr __ro_after_init = {
+ .attr = {
+ .name = "kheaders.tar.xz",
+ .mode = 0444,
+ },
+ .read = &ikheaders_read,
};
static int __init ikheaders_init(void)
{
- struct proc_dir_entry *entry;
-
- /* create the current headers file */
- entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL,
- &ikheaders_file_ops);
- if (!entry)
- return -ENOMEM;
-
- proc_set_size(entry,
- &kernel_headers_data_end -
- &kernel_headers_data);
- return 0;
+ kheaders_attr.size = (&kernel_headers_data_end -
+ &kernel_headers_data);
+ return sysfs_create_bin_file(kernel_kobj, &kheaders_attr);
}
static void __exit ikheaders_cleanup(void)
{
- remove_proc_entry("kheaders.tar.xz", NULL);
+ sysfs_remove_bin_file(kernel_kobj, &kheaders_attr);
}
module_init(ikheaders_init);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b1ea30a5540e..445337c107e0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1,21 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Kernel Probes (KProbes)
* kernel/kprobes.c
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright (C) IBM Corporation, 2002, 2004
*
* 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 46ba853656f6..35859da8bd4f 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
* are not related to any other subsystem
*
* Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
- *
- * This file is release under the GPLv2
- *
*/
#include <linux/kobject.h>
diff --git a/kernel/kthread.c b/kernel/kthread.c
index be4e8795561a..621467c33fef 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel thread helper functions.
* Copyright (C) 2004 IBM Corporation, Rusty Russell.
*
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 871734ea2f04..e3acead004e6 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* latencytop.c: Latency display infrastructure
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
/*
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
index ec4565122e65..54102deb50ba 100644
--- a/kernel/livepatch/Kconfig
+++ b/kernel/livepatch/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config HAVE_LIVEPATCH
bool
help
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index b36ceda6488e..cf9b5bcdb952 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_LIVEPATCH) += livepatch.o
livepatch-objs := core.o patch.o shadow.o transition.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 91cd519756d3..c4ce08f43bd6 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* core.c - Kernel Live Patching Core
*
* Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
* Copyright (C) 2014 SUSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -30,6 +18,7 @@
#include <linux/elf.h>
#include <linux/moduleloader.h>
#include <linux/completion.h>
+#include <linux/memory.h>
#include <asm/cacheflush.h>
#include "core.h"
#include "patch.h"
@@ -730,16 +719,21 @@ static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_func *func;
int ret;
+ mutex_lock(&text_mutex);
+
module_disable_ro(patch->mod);
ret = klp_write_object_relocations(patch->mod, obj);
if (ret) {
module_enable_ro(patch->mod, true);
+ mutex_unlock(&text_mutex);
return ret;
}
arch_klp_init_object_loaded(patch, obj);
module_enable_ro(patch->mod, true);
+ mutex_unlock(&text_mutex);
+
klp_for_each_func(obj, func) {
ret = klp_find_object_symbol(obj->name, func->old_name,
func->old_sympos,
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 99cb3ad05eb4..bd43537702bd 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* patch.c - livepatch patching functions
*
* Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
* Copyright (C) 2014 SUSE
* Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index 83958c814439..e5c9fb295ba9 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* shadow.c - Shadow Variables
*
* Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com>
* Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
* Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/**
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index c53370d596be..abb2a4a2cbb2 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* transition.c - Kernel Live Patching transition functions
*
* Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 6fe2f333aecb..45452facff3b 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -3,7 +3,7 @@
# and is generally not a function of system call inputs.
KCOV_INSTRUMENT := n
-obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o
+obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
index feb1acc54611..8c7e7d25f09c 100644
--- a/kernel/locking/lock_events.h
+++ b/kernel/locking/lock_events.h
@@ -31,12 +31,13 @@ enum lock_events {
DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]);
/*
- * Increment the PV qspinlock statistical counters
+ * Increment the statistical counters. use raw_cpu_inc() because of lower
+ * overhead and we don't care if we loose the occasional update.
*/
static inline void __lockevent_inc(enum lock_events event, bool cond)
{
if (cond)
- __this_cpu_inc(lockevents[event]);
+ raw_cpu_inc(lockevents[event]);
}
#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true)
@@ -44,7 +45,7 @@ static inline void __lockevent_inc(enum lock_events event, bool cond)
static inline void __lockevent_add(enum lock_events event, int inc)
{
- __this_cpu_add(lockevents[event], inc);
+ raw_cpu_add(lockevents[event], inc);
}
#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c)
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
index ad7668cfc9da..239039d0ce21 100644
--- a/kernel/locking/lock_events_list.h
+++ b/kernel/locking/lock_events_list.h
@@ -56,12 +56,16 @@ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */
LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */
LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */
-LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */
-LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */
+LOCK_EVENT(rwsem_opt_rlock) /* # of opt-acquired read locks */
+LOCK_EVENT(rwsem_opt_wlock) /* # of opt-acquired write locks */
+LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */
+LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */
+LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */
+LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */
LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */
LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */
LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */
-LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */
+LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */
LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
-LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */
+LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index d06190fa5082..341f52117f88 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/lockdep.c
*
@@ -150,17 +151,28 @@ unsigned long nr_lock_classes;
static
#endif
struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
static inline struct lock_class *hlock_class(struct held_lock *hlock)
{
- if (!hlock->class_idx) {
+ unsigned int class_idx = hlock->class_idx;
+
+ /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfield */
+ barrier();
+
+ if (!test_bit(class_idx, lock_classes_in_use)) {
/*
* Someone passed in garbage, we give up.
*/
DEBUG_LOCKS_WARN_ON(1);
return NULL;
}
- return lock_classes + hlock->class_idx - 1;
+
+ /*
+ * At this point, if the passed hlock->class_idx is still garbage,
+ * we just have to live with it
+ */
+ return lock_classes + class_idx;
}
#ifdef CONFIG_LOCK_STAT
@@ -358,6 +370,13 @@ static inline u64 iterate_chain_key(u64 key, u32 idx)
return k0 | (u64)k1 << 32;
}
+void lockdep_init_task(struct task_struct *task)
+{
+ task->lockdep_depth = 0; /* no locks held yet */
+ task->curr_chain_key = INITIAL_CHAIN_KEY;
+ task->lockdep_recursion = 0;
+}
+
void lockdep_off(void)
{
current->lockdep_recursion++;
@@ -418,13 +437,6 @@ static int verbose(struct lock_class *class)
return 0;
}
-/*
- * Stack-trace: tightly packed array of stack backtrace
- * addresses. Protected by the graph_lock.
- */
-unsigned long nr_stack_trace_entries;
-static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
-
static void print_lockdep_off(const char *bug_msg)
{
printk(KERN_DEBUG "%s\n", bug_msg);
@@ -434,6 +446,15 @@ static void print_lockdep_off(const char *bug_msg)
#endif
}
+unsigned long nr_stack_trace_entries;
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the graph_lock.
+ */
+static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+
static int save_trace(struct lock_trace *trace)
{
unsigned long *entries = stack_trace + nr_stack_trace_entries;
@@ -456,6 +477,7 @@ static int save_trace(struct lock_trace *trace)
return 1;
}
+#endif
unsigned int nr_hardirq_chains;
unsigned int nr_softirq_chains;
@@ -469,6 +491,7 @@ unsigned int max_lockdep_depth;
DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
#endif
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
/*
* Locking printouts:
*/
@@ -486,6 +509,7 @@ static const char *usage_str[] =
#undef LOCKDEP_STATE
[LOCK_USED] = "INITIAL USE",
};
+#endif
const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
{
@@ -499,15 +523,26 @@ static inline unsigned long lock_flag(enum lock_usage_bit bit)
static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
{
+ /*
+ * The usage character defaults to '.' (i.e., irqs disabled and not in
+ * irq context), which is the safest usage category.
+ */
char c = '.';
- if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
+ /*
+ * The order of the following usage checks matters, which will
+ * result in the outcome character as follows:
+ *
+ * - '+': irq is enabled and not in irq context
+ * - '-': in irq context and irq is disabled
+ * - '?': in irq context and irq is enabled
+ */
+ if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) {
c = '+';
- if (class->usage_mask & lock_flag(bit)) {
- c = '-';
- if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
+ if (class->usage_mask & lock_flag(bit))
c = '?';
- }
+ } else if (class->usage_mask & lock_flag(bit))
+ c = '-';
return c;
}
@@ -571,19 +606,22 @@ static void print_lock(struct held_lock *hlock)
/*
* We can be called locklessly through debug_show_all_locks() so be
* extra careful, the hlock might have been released and cleared.
+ *
+ * If this indeed happens, lets pretend it does not hurt to continue
+ * to print the lock unless the hlock class_idx does not point to a
+ * registered class. The rationale here is: since we don't attempt
+ * to distinguish whether we are in this situation, if it just
+ * happened we can't count on class_idx to tell either.
*/
- unsigned int class_idx = hlock->class_idx;
-
- /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */
- barrier();
+ struct lock_class *lock = hlock_class(hlock);
- if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) {
+ if (!lock) {
printk(KERN_CONT "<RELEASED>\n");
return;
}
printk(KERN_CONT "%p", hlock->instance);
- print_lock_name(lock_classes + class_idx - 1);
+ print_lock_name(lock);
printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
}
@@ -731,7 +769,8 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
* Huh! same key, different name? Did someone trample
* on some memory? We're most confused.
*/
- WARN_ON_ONCE(class->name != lock->name);
+ WARN_ON_ONCE(class->name != lock->name &&
+ lock->key != &__lockdep_no_validate__);
return class;
}
}
@@ -837,11 +876,11 @@ static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
static bool check_lock_chain_key(struct lock_chain *chain)
{
#ifdef CONFIG_PROVE_LOCKING
- u64 chain_key = 0;
+ u64 chain_key = INITIAL_CHAIN_KEY;
int i;
for (i = chain->base; i < chain->base + chain->depth; i++)
- chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1);
+ chain_key = iterate_chain_key(chain_key, chain_hlocks[i]);
/*
* The 'unsigned long long' casts avoid that a compiler warning
* is reported when building tools/lib/lockdep.
@@ -1116,6 +1155,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
return NULL;
}
nr_lock_classes++;
+ __set_bit(class - lock_classes, lock_classes_in_use);
debug_atomic_inc(nr_unused_locks);
class->key = key;
class->name = lock->name;
@@ -1227,13 +1267,17 @@ static int add_lock_to_list(struct lock_class *this,
#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
/*
- * The circular_queue and helpers is used to implement the
- * breadth-first search(BFS)algorithem, by which we can build
- * the shortest path from the next lock to be acquired to the
- * previous held lock if there is a circular between them.
+ * The circular_queue and helpers are used to implement graph
+ * breadth-first search (BFS) algorithm, by which we can determine
+ * whether there is a path from a lock to another. In deadlock checks,
+ * a path from the next lock to be acquired to a previous held lock
+ * indicates that adding the <prev> -> <next> lock dependency will
+ * produce a circle in the graph. Breadth-first search instead of
+ * depth-first search is used in order to find the shortest (circular)
+ * path.
*/
struct circular_queue {
- unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
+ struct lock_list *element[MAX_CIRCULAR_QUEUE_SIZE];
unsigned int front, rear;
};
@@ -1259,7 +1303,7 @@ static inline int __cq_full(struct circular_queue *cq)
return ((cq->rear + 1) & CQ_MASK) == cq->front;
}
-static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
+static inline int __cq_enqueue(struct circular_queue *cq, struct lock_list *elem)
{
if (__cq_full(cq))
return -1;
@@ -1269,14 +1313,21 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
return 0;
}
-static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
+/*
+ * Dequeue an element from the circular_queue, return a lock_list if
+ * the queue is not empty, or NULL if otherwise.
+ */
+static inline struct lock_list * __cq_dequeue(struct circular_queue *cq)
{
+ struct lock_list * lock;
+
if (__cq_empty(cq))
- return -1;
+ return NULL;
- *elem = cq->element[cq->front];
+ lock = cq->element[cq->front];
cq->front = (cq->front + 1) & CQ_MASK;
- return 0;
+
+ return lock;
}
static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
@@ -1321,13 +1372,32 @@ static inline int get_lock_depth(struct lock_list *child)
return depth;
}
+/*
+ * Return the forward or backward dependency list.
+ *
+ * @lock: the lock_list to get its class's dependency list
+ * @offset: the offset to struct lock_class to determine whether it is
+ * locks_after or locks_before
+ */
+static inline struct list_head *get_dep_list(struct lock_list *lock, int offset)
+{
+ void *lock_class = lock->class;
+
+ return lock_class + offset;
+}
+
+/*
+ * Forward- or backward-dependency search, used for both circular dependency
+ * checking and hardirq-unsafe/softirq-unsafe checking.
+ */
static int __bfs(struct lock_list *source_entry,
void *data,
int (*match)(struct lock_list *entry, void *data),
struct lock_list **target_entry,
- int forward)
+ int offset)
{
struct lock_list *entry;
+ struct lock_list *lock;
struct list_head *head;
struct circular_queue *cq = &lock_cq;
int ret = 1;
@@ -1338,31 +1408,21 @@ static int __bfs(struct lock_list *source_entry,
goto exit;
}
- if (forward)
- head = &source_entry->class->locks_after;
- else
- head = &source_entry->class->locks_before;
-
+ head = get_dep_list(source_entry, offset);
if (list_empty(head))
goto exit;
__cq_init(cq);
- __cq_enqueue(cq, (unsigned long)source_entry);
+ __cq_enqueue(cq, source_entry);
- while (!__cq_empty(cq)) {
- struct lock_list *lock;
-
- __cq_dequeue(cq, (unsigned long *)&lock);
+ while ((lock = __cq_dequeue(cq))) {
if (!lock->class) {
ret = -2;
goto exit;
}
- if (forward)
- head = &lock->class->locks_after;
- else
- head = &lock->class->locks_before;
+ head = get_dep_list(lock, offset);
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -1376,7 +1436,7 @@ static int __bfs(struct lock_list *source_entry,
goto exit;
}
- if (__cq_enqueue(cq, (unsigned long)entry)) {
+ if (__cq_enqueue(cq, entry)) {
ret = -1;
goto exit;
}
@@ -1395,7 +1455,8 @@ static inline int __bfs_forwards(struct lock_list *src_entry,
int (*match)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry, 1);
+ return __bfs(src_entry, data, match, target_entry,
+ offsetof(struct lock_class, locks_after));
}
@@ -1404,16 +1465,11 @@ static inline int __bfs_backwards(struct lock_list *src_entry,
int (*match)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry, 0);
+ return __bfs(src_entry, data, match, target_entry,
+ offsetof(struct lock_class, locks_before));
}
-/*
- * Recursive, forwards-direction lock-dependency checking, used for
- * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
- * checking.
- */
-
static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
{
unsigned long *entries = stack_trace + trace->offset;
@@ -1425,16 +1481,15 @@ static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
* Print a dependency chain entry (this is only done when a deadlock
* has been detected):
*/
-static noinline int
+static noinline void
print_circular_bug_entry(struct lock_list *target, int depth)
{
if (debug_locks_silent)
- return 0;
+ return;
printk("\n-> #%u", depth);
print_lock_name(target->class);
printk(KERN_CONT ":\n");
print_lock_trace(&target->trace, 6);
- return 0;
}
static void
@@ -1491,7 +1546,7 @@ print_circular_lock_scenario(struct held_lock *src,
* When a circular dependency is detected, print the
* header first:
*/
-static noinline int
+static noinline void
print_circular_bug_header(struct lock_list *entry, unsigned int depth,
struct held_lock *check_src,
struct held_lock *check_tgt)
@@ -1499,7 +1554,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
struct task_struct *curr = current;
if (debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("======================================================\n");
@@ -1517,8 +1572,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
print_circular_bug_entry(entry, depth);
-
- return 0;
}
static inline int class_equal(struct lock_list *entry, void *data)
@@ -1526,10 +1579,10 @@ static inline int class_equal(struct lock_list *entry, void *data)
return entry->class == data;
}
-static noinline int print_circular_bug(struct lock_list *this,
- struct lock_list *target,
- struct held_lock *check_src,
- struct held_lock *check_tgt)
+static noinline void print_circular_bug(struct lock_list *this,
+ struct lock_list *target,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
{
struct task_struct *curr = current;
struct lock_list *parent;
@@ -1537,10 +1590,10 @@ static noinline int print_circular_bug(struct lock_list *this,
int depth;
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
if (!save_trace(&this->trace))
- return 0;
+ return;
depth = get_lock_depth(target);
@@ -1562,21 +1615,17 @@ static noinline int print_circular_bug(struct lock_list *this,
printk("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
-static noinline int print_bfs_bug(int ret)
+static noinline void print_bfs_bug(int ret)
{
if (!debug_locks_off_graph_unlock())
- return 0;
+ return;
/*
* Breadth-first-search failed, graph got corrupted?
*/
WARN(1, "lockdep bfs error:%d\n", ret);
-
- return 0;
}
static int noop_count(struct lock_list *entry, void *data)
@@ -1639,36 +1688,95 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
}
/*
- * Prove that the dependency graph starting at <entry> can not
- * lead to <target>. Print an error and return 0 if it does.
+ * Check that the dependency graph starting at <src> can lead to
+ * <target> or not. Print an error and return 0 if it does.
*/
static noinline int
-check_noncircular(struct lock_list *root, struct lock_class *target,
- struct lock_list **target_entry)
+check_path(struct lock_class *target, struct lock_list *src_entry,
+ struct lock_list **target_entry)
{
- int result;
+ int ret;
+
+ ret = __bfs_forwards(src_entry, (void *)target, class_equal,
+ target_entry);
+
+ if (unlikely(ret < 0))
+ print_bfs_bug(ret);
+
+ return ret;
+}
+
+/*
+ * Prove that the dependency graph starting at <src> can not
+ * lead to <target>. If it can, there is a circle when adding
+ * <target> -> <src> dependency.
+ *
+ * Print an error and return 0 if it does.
+ */
+static noinline int
+check_noncircular(struct held_lock *src, struct held_lock *target,
+ struct lock_trace *trace)
+{
+ int ret;
+ struct lock_list *uninitialized_var(target_entry);
+ struct lock_list src_entry = {
+ .class = hlock_class(src),
+ .parent = NULL,
+ };
debug_atomic_inc(nr_cyclic_checks);
- result = __bfs_forwards(root, target, class_equal, target_entry);
+ ret = check_path(hlock_class(target), &src_entry, &target_entry);
- return result;
+ if (unlikely(!ret)) {
+ if (!trace->nr_entries) {
+ /*
+ * If save_trace fails here, the printing might
+ * trigger a WARN but because of the !nr_entries it
+ * should not do bad things.
+ */
+ save_trace(trace);
+ }
+
+ print_circular_bug(&src_entry, target_entry, src, target);
+ }
+
+ return ret;
}
+#ifdef CONFIG_LOCKDEP_SMALL
+/*
+ * Check that the dependency graph starting at <src> can lead to
+ * <target> or not. If it can, <src> -> <target> dependency is already
+ * in the graph.
+ *
+ * Print an error and return 2 if it does or 1 if it does not.
+ */
static noinline int
-check_redundant(struct lock_list *root, struct lock_class *target,
- struct lock_list **target_entry)
+check_redundant(struct held_lock *src, struct held_lock *target)
{
- int result;
+ int ret;
+ struct lock_list *uninitialized_var(target_entry);
+ struct lock_list src_entry = {
+ .class = hlock_class(src),
+ .parent = NULL,
+ };
debug_atomic_inc(nr_redundant_checks);
- result = __bfs_forwards(root, target, class_equal, target_entry);
+ ret = check_path(hlock_class(target), &src_entry, &target_entry);
- return result;
+ if (!ret) {
+ debug_atomic_inc(nr_redundant);
+ ret = 2;
+ } else if (ret < 0)
+ ret = 0;
+
+ return ret;
}
+#endif
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+#ifdef CONFIG_TRACE_IRQFLAGS
static inline int usage_accumulate(struct lock_list *entry, void *mask)
{
@@ -1765,7 +1873,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
*/
static void __used
print_shortest_lock_dependencies(struct lock_list *leaf,
- struct lock_list *root)
+ struct lock_list *root)
{
struct lock_list *entry = leaf;
int depth;
@@ -1787,8 +1895,6 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
entry = get_lock_parent(entry);
depth--;
} while (entry && (depth >= 0));
-
- return;
}
static void
@@ -1847,7 +1953,7 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
printk("\n *** DEADLOCK ***\n\n");
}
-static int
+static void
print_bad_irq_dependency(struct task_struct *curr,
struct lock_list *prev_root,
struct lock_list *next_root,
@@ -1860,7 +1966,7 @@ print_bad_irq_dependency(struct task_struct *curr,
const char *irqclass)
{
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("=====================================================\n");
@@ -1906,19 +2012,17 @@ print_bad_irq_dependency(struct task_struct *curr,
pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
if (!save_trace(&prev_root->trace))
- return 0;
+ return;
print_shortest_lock_dependencies(backwards_entry, prev_root);
pr_warn("\nthe dependencies between the lock to be acquired");
pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
if (!save_trace(&next_root->trace))
- return 0;
+ return;
print_shortest_lock_dependencies(forwards_entry, next_root);
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
static const char *state_names[] = {
@@ -2065,8 +2169,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
this.class = hlock_class(prev);
ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
- if (ret < 0)
- return print_bfs_bug(ret);
+ if (ret < 0) {
+ print_bfs_bug(ret);
+ return 0;
+ }
usage_mask &= LOCKF_USED_IN_IRQ_ALL;
if (!usage_mask)
@@ -2082,8 +2188,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
that.class = hlock_class(next);
ret = find_usage_forwards(&that, forward_mask, &target_entry1);
- if (ret < 0)
- return print_bfs_bug(ret);
+ if (ret < 0) {
+ print_bfs_bug(ret);
+ return 0;
+ }
if (ret == 1)
return ret;
@@ -2095,8 +2203,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
backward_mask = original_mask(target_entry1->class->usage_mask);
ret = find_usage_backwards(&this, backward_mask, &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
+ if (ret < 0) {
+ print_bfs_bug(ret);
+ return 0;
+ }
if (DEBUG_LOCKS_WARN_ON(ret == 1))
return 1;
@@ -2110,11 +2220,13 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
if (DEBUG_LOCKS_WARN_ON(ret == -1))
return 1;
- return print_bad_irq_dependency(curr, &this, &that,
- target_entry, target_entry1,
- prev, next,
- backward_bit, forward_bit,
- state_name(backward_bit));
+ print_bad_irq_dependency(curr, &this, &that,
+ target_entry, target_entry1,
+ prev, next,
+ backward_bit, forward_bit,
+ state_name(backward_bit));
+
+ return 0;
}
static void inc_chains(void)
@@ -2142,11 +2254,10 @@ static inline void inc_chains(void)
nr_process_chains++;
}
-#endif
+#endif /* CONFIG_TRACE_IRQFLAGS */
static void
-print_deadlock_scenario(struct held_lock *nxt,
- struct held_lock *prv)
+print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv)
{
struct lock_class *next = hlock_class(nxt);
struct lock_class *prev = hlock_class(prv);
@@ -2164,12 +2275,12 @@ print_deadlock_scenario(struct held_lock *nxt,
printk(" May be due to missing lock nesting notation\n\n");
}
-static int
+static void
print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next)
{
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("============================================\n");
@@ -2188,8 +2299,6 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
/*
@@ -2201,8 +2310,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
* Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
*/
static int
-check_deadlock(struct task_struct *curr, struct held_lock *next,
- struct lockdep_map *next_instance, int read)
+check_deadlock(struct task_struct *curr, struct held_lock *next)
{
struct held_lock *prev;
struct held_lock *nest = NULL;
@@ -2221,7 +2329,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
* Allow read-after-read recursion of the same
* lock class (i.e. read_lock(lock)+read_lock(lock)):
*/
- if ((read == 2) && prev->read)
+ if ((next->read == 2) && prev->read)
return 2;
/*
@@ -2231,14 +2339,15 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
if (nest)
return 2;
- return print_deadlock_bug(curr, prev, next);
+ print_deadlock_bug(curr, prev, next);
+ return 0;
}
return 1;
}
/*
* There was a chain-cache miss, and we are about to add a new dependency
- * to a previous lock. We recursively validate the following rules:
+ * to a previous lock. We validate the following rules:
*
* - would the adding of the <prev> -> <next> dependency create a
* circular dependency in the graph? [== circular deadlock]
@@ -2262,9 +2371,7 @@ static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next, int distance, struct lock_trace *trace)
{
- struct lock_list *uninitialized_var(target_entry);
struct lock_list *entry;
- struct lock_list this;
int ret;
if (!hlock_class(prev)->key || !hlock_class(next)->key) {
@@ -2288,28 +2395,16 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
/*
* Prove that the new <prev> -> <next> dependency would not
* create a circular dependency in the graph. (We do this by
- * forward-recursing into the graph starting at <next>, and
- * checking whether we can reach <prev>.)
+ * a breadth-first search into the graph starting at <next>,
+ * and check whether we can reach <prev>.)
*
- * We are using global variables to control the recursion, to
- * keep the stackframe size of the recursive functions low:
+ * The search is limited by the size of the circular queue (i.e.,
+ * MAX_CIRCULAR_QUEUE_SIZE) which keeps track of a breadth of nodes
+ * in the graph whose neighbours are to be checked.
*/
- this.class = hlock_class(next);
- this.parent = NULL;
- ret = check_noncircular(&this, hlock_class(prev), &target_entry);
- if (unlikely(!ret)) {
- if (!trace->nr_entries) {
- /*
- * If save_trace fails here, the printing might
- * trigger a WARN but because of the !nr_entries it
- * should not do bad things.
- */
- save_trace(trace);
- }
- return print_circular_bug(&this, target_entry, next, prev);
- }
- else if (unlikely(ret < 0))
- return print_bfs_bug(ret);
+ ret = check_noncircular(next, prev, trace);
+ if (unlikely(ret <= 0))
+ return 0;
if (!check_irq_usage(curr, prev, next))
return 0;
@@ -2340,19 +2435,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
}
}
+#ifdef CONFIG_LOCKDEP_SMALL
/*
* Is the <prev> -> <next> link redundant?
*/
- this.class = hlock_class(prev);
- this.parent = NULL;
- ret = check_redundant(&this, hlock_class(next), &target_entry);
- if (!ret) {
- debug_atomic_inc(nr_redundant);
- return 2;
- }
- if (ret < 0)
- return print_bfs_bug(ret);
-
+ ret = check_redundant(prev, next);
+ if (ret != 1)
+ return ret;
+#endif
if (!trace->nr_entries && !save_trace(trace))
return 0;
@@ -2504,12 +2594,13 @@ static void
print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next)
{
struct held_lock *hlock;
- u64 chain_key = 0;
+ u64 chain_key = INITIAL_CHAIN_KEY;
int depth = curr->lockdep_depth;
- int i;
+ int i = get_first_held_lock(curr, hlock_next);
- printk("depth: %u\n", depth + 1);
- for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) {
+ printk("depth: %u (irq_context %u)\n", depth - i + 1,
+ hlock_next->irq_context);
+ for (; i < depth; i++) {
hlock = curr->held_locks + i;
chain_key = print_chain_key_iteration(hlock->class_idx, chain_key);
@@ -2523,13 +2614,13 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne
static void print_chain_keys_chain(struct lock_chain *chain)
{
int i;
- u64 chain_key = 0;
+ u64 chain_key = INITIAL_CHAIN_KEY;
int class_id;
printk("depth: %u\n", chain->depth);
for (i = 0; i < chain->depth; i++) {
class_id = chain_hlocks[chain->base + i];
- chain_key = print_chain_key_iteration(class_id + 1, chain_key);
+ chain_key = print_chain_key_iteration(class_id, chain_key);
print_lock_name(lock_classes + class_id);
printk("\n");
@@ -2580,7 +2671,7 @@ static int check_no_collision(struct task_struct *curr,
}
for (j = 0; j < chain->depth - 1; j++, i++) {
- id = curr->held_locks[i].class_idx - 1;
+ id = curr->held_locks[i].class_idx;
if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) {
print_collision(curr, hlock, chain);
@@ -2663,7 +2754,7 @@ static inline int add_chain_cache(struct task_struct *curr,
if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
chain->base = nr_chain_hlocks;
for (j = 0; j < chain->depth - 1; j++, i++) {
- int lock_id = curr->held_locks[i].class_idx - 1;
+ int lock_id = curr->held_locks[i].class_idx;
chain_hlocks[chain->base + j] = lock_id;
}
chain_hlocks[chain->base + j] = class - lock_classes;
@@ -2753,8 +2844,9 @@ cache_hit:
return 1;
}
-static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
- struct held_lock *hlock, int chain_head, u64 chain_key)
+static int validate_chain(struct task_struct *curr,
+ struct held_lock *hlock,
+ int chain_head, u64 chain_key)
{
/*
* Trylock needs to maintain the stack of held locks, but it
@@ -2775,12 +2867,18 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
* - is softirq-safe, if this lock is hardirq-unsafe
*
* And check whether the new lock's dependency graph
- * could lead back to the previous lock.
+ * could lead back to the previous lock:
*
- * any of these scenarios could lead to a deadlock. If
- * All validations
+ * - within the current held-lock stack
+ * - across our accumulated lock dependency records
+ *
+ * any of these scenarios could lead to a deadlock.
*/
- int ret = check_deadlock(curr, hlock, lock, hlock->read);
+ /*
+ * The simple case: does the current hold the same lock
+ * already?
+ */
+ int ret = check_deadlock(curr, hlock);
if (!ret)
return 0;
@@ -2811,16 +2909,12 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
}
#else
static inline int validate_chain(struct task_struct *curr,
- struct lockdep_map *lock, struct held_lock *hlock,
- int chain_head, u64 chain_key)
+ struct held_lock *hlock,
+ int chain_head, u64 chain_key)
{
return 1;
}
-
-static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
-{
-}
-#endif
+#endif /* CONFIG_PROVE_LOCKING */
/*
* We are building curr_chain_key incrementally, so double-check
@@ -2831,7 +2925,7 @@ static void check_chain_key(struct task_struct *curr)
#ifdef CONFIG_DEBUG_LOCKDEP
struct held_lock *hlock, *prev_hlock = NULL;
unsigned int i;
- u64 chain_key = 0;
+ u64 chain_key = INITIAL_CHAIN_KEY;
for (i = 0; i < curr->lockdep_depth; i++) {
hlock = curr->held_locks + i;
@@ -2847,15 +2941,17 @@ static void check_chain_key(struct task_struct *curr)
(unsigned long long)hlock->prev_chain_key);
return;
}
+
/*
- * Whoops ran out of static storage again?
+ * hlock->class_idx can't go beyond MAX_LOCKDEP_KEYS, but is
+ * it registered lock class index?
*/
- if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS))
+ if (DEBUG_LOCKS_WARN_ON(!test_bit(hlock->class_idx, lock_classes_in_use)))
return;
if (prev_hlock && (prev_hlock->irq_context !=
hlock->irq_context))
- chain_key = 0;
+ chain_key = INITIAL_CHAIN_KEY;
chain_key = iterate_chain_key(chain_key, hlock->class_idx);
prev_hlock = hlock;
}
@@ -2873,14 +2969,11 @@ static void check_chain_key(struct task_struct *curr)
#endif
}
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
static int mark_lock(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit);
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
-
-
-static void
-print_usage_bug_scenario(struct held_lock *lock)
+static void print_usage_bug_scenario(struct held_lock *lock)
{
struct lock_class *class = hlock_class(lock);
@@ -2897,12 +2990,12 @@ print_usage_bug_scenario(struct held_lock *lock)
printk("\n *** DEADLOCK ***\n\n");
}
-static int
+static void
print_usage_bug(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
{
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("================================\n");
@@ -2932,8 +3025,6 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
/*
@@ -2943,8 +3034,10 @@ static inline int
valid_state(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
{
- if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
- return print_usage_bug(curr, this, bad_bit, new_bit);
+ if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) {
+ print_usage_bug(curr, this, bad_bit, new_bit);
+ return 0;
+ }
return 1;
}
@@ -2952,7 +3045,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
/*
* print irq inversion bug:
*/
-static int
+static void
print_irq_inversion_bug(struct task_struct *curr,
struct lock_list *root, struct lock_list *other,
struct held_lock *this, int forwards,
@@ -2963,7 +3056,7 @@ print_irq_inversion_bug(struct task_struct *curr,
int depth;
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("========================================================\n");
@@ -3004,13 +3097,11 @@ print_irq_inversion_bug(struct task_struct *curr,
pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
if (!save_trace(&root->trace))
- return 0;
+ return;
print_shortest_lock_dependencies(other, root);
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
/*
@@ -3028,13 +3119,16 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
root.parent = NULL;
root.class = hlock_class(this);
ret = find_usage_forwards(&root, lock_flag(bit), &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
+ if (ret < 0) {
+ print_bfs_bug(ret);
+ return 0;
+ }
if (ret == 1)
return ret;
- return print_irq_inversion_bug(curr, &root, target_entry,
- this, 1, irqclass);
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, irqclass);
+ return 0;
}
/*
@@ -3052,13 +3146,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
root.parent = NULL;
root.class = hlock_class(this);
ret = find_usage_backwards(&root, lock_flag(bit), &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
+ if (ret < 0) {
+ print_bfs_bug(ret);
+ return 0;
+ }
if (ret == 1)
return ret;
- return print_irq_inversion_bug(curr, &root, target_entry,
- this, 0, irqclass);
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 0, irqclass);
+ return 0;
}
void print_irqtrace_events(struct task_struct *curr)
@@ -3141,7 +3238,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
* Validate that the lock dependencies don't have conflicting usage
* states.
*/
- if ((!read || !dir || STRICT_READ_CHECKS) &&
+ if ((!read || STRICT_READ_CHECKS) &&
!usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK)))
return 0;
@@ -3366,8 +3463,12 @@ void trace_softirqs_off(unsigned long ip)
debug_atomic_inc(redundant_softirqs_off);
}
-static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
+static int
+mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
{
+ if (!check)
+ goto lock_used;
+
/*
* If non-trylock use in a hardirq or softirq context, then
* mark the lock as used in these contexts:
@@ -3411,6 +3512,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
}
}
+lock_used:
+ /* mark it as used: */
+ if (!mark_lock(curr, hlock, LOCK_USED))
+ return 0;
+
return 1;
}
@@ -3442,35 +3548,6 @@ static int separate_irq_context(struct task_struct *curr,
return 0;
}
-#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
-
-static inline
-int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit new_bit)
-{
- WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
- return 1;
-}
-
-static inline int mark_irqflags(struct task_struct *curr,
- struct held_lock *hlock)
-{
- return 1;
-}
-
-static inline unsigned int task_irq_context(struct task_struct *task)
-{
- return 0;
-}
-
-static inline int separate_irq_context(struct task_struct *curr,
- struct held_lock *hlock)
-{
- return 0;
-}
-
-#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
-
/*
* Mark a lock with a usage bit, and validate the state transition:
*/
@@ -3479,6 +3556,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
{
unsigned int new_mask = 1 << new_bit, ret = 1;
+ if (new_bit >= LOCK_USAGE_STATES) {
+ DEBUG_LOCKS_WARN_ON(1);
+ return 0;
+ }
+
/*
* If already set then do not dirty the cacheline,
* nor do any checks:
@@ -3502,25 +3584,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return 0;
switch (new_bit) {
-#define LOCKDEP_STATE(__STATE) \
- case LOCK_USED_IN_##__STATE: \
- case LOCK_USED_IN_##__STATE##_READ: \
- case LOCK_ENABLED_##__STATE: \
- case LOCK_ENABLED_##__STATE##_READ:
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
- ret = mark_lock_irq(curr, this, new_bit);
- if (!ret)
- return 0;
- break;
case LOCK_USED:
debug_atomic_dec(nr_unused_locks);
break;
default:
- if (!debug_locks_off_graph_unlock())
+ ret = mark_lock_irq(curr, this, new_bit);
+ if (!ret)
return 0;
- WARN_ON(1);
- return 0;
}
graph_unlock();
@@ -3538,6 +3608,27 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return ret;
}
+#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+
+static inline int
+mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
+{
+ return 1;
+}
+
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return 0;
+}
+
+static inline int separate_irq_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ return 0;
+}
+
+#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+
/*
* Initialize a lock instance's lock-class mapping info:
*/
@@ -3601,15 +3692,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
-static int
+static void
print_lock_nested_lock_not_held(struct task_struct *curr,
struct held_lock *hlock,
unsigned long ip)
{
if (!debug_locks_off())
- return 0;
+ return;
if (debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("==================================\n");
@@ -3631,8 +3722,6 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
static int __lock_is_held(const struct lockdep_map *lock, int read);
@@ -3697,24 +3786,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
return 0;
- class_idx = class - lock_classes + 1;
+ class_idx = class - lock_classes;
if (depth) {
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
- if (hlock->references) {
- /*
- * Check: unsigned int references:12, overflow.
- */
- if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1))
- return 0;
+ if (!references)
+ references++;
+ if (!hlock->references)
hlock->references++;
- } else {
- hlock->references = 2;
- }
- return 1;
+ hlock->references += references;
+
+ /* Overflow */
+ if (DEBUG_LOCKS_WARN_ON(hlock->references < references))
+ return 0;
+
+ return 2;
}
}
@@ -3741,11 +3830,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
hlock->pin_count = pin_count;
- if (check && !mark_irqflags(curr, hlock))
- return 0;
-
- /* mark it as used: */
- if (!mark_lock(curr, hlock, LOCK_USED))
+ /* Initialize the lock usage bit */
+ if (!mark_usage(curr, hlock, check))
return 0;
/*
@@ -3759,9 +3845,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
* the hash, not class->key.
*/
/*
- * Whoops, we did it again.. ran straight out of our static allocation.
+ * Whoops, we did it again.. class_idx is invalid.
*/
- if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS))
+ if (DEBUG_LOCKS_WARN_ON(!test_bit(class_idx, lock_classes_in_use)))
return 0;
chain_key = curr->curr_chain_key;
@@ -3769,27 +3855,29 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
/*
* How can we have a chain hash when we ain't got no keys?!
*/
- if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
+ if (DEBUG_LOCKS_WARN_ON(chain_key != INITIAL_CHAIN_KEY))
return 0;
chain_head = 1;
}
hlock->prev_chain_key = chain_key;
if (separate_irq_context(curr, hlock)) {
- chain_key = 0;
+ chain_key = INITIAL_CHAIN_KEY;
chain_head = 1;
}
chain_key = iterate_chain_key(chain_key, class_idx);
- if (nest_lock && !__lock_is_held(nest_lock, -1))
- return print_lock_nested_lock_not_held(curr, hlock, ip);
+ if (nest_lock && !__lock_is_held(nest_lock, -1)) {
+ print_lock_nested_lock_not_held(curr, hlock, ip);
+ return 0;
+ }
if (!debug_locks_silent) {
WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key);
WARN_ON_ONCE(!hlock_class(hlock)->key);
}
- if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
+ if (!validate_chain(curr, hlock, chain_head, chain_key))
return 0;
curr->curr_chain_key = chain_key;
@@ -3818,14 +3906,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
return 1;
}
-static int
-print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
- unsigned long ip)
+static void print_unlock_imbalance_bug(struct task_struct *curr,
+ struct lockdep_map *lock,
+ unsigned long ip)
{
if (!debug_locks_off())
- return 0;
+ return;
if (debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("=====================================\n");
@@ -3843,8 +3931,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
static int match_held_lock(const struct held_lock *hlock,
@@ -3876,7 +3962,7 @@ static int match_held_lock(const struct held_lock *hlock,
if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
return 0;
- if (hlock->class_idx == class - lock_classes + 1)
+ if (hlock->class_idx == class - lock_classes)
return 1;
}
@@ -3920,22 +4006,33 @@ out:
}
static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
- int idx)
+ int idx, unsigned int *merged)
{
struct held_lock *hlock;
+ int first_idx = idx;
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) {
- if (!__lock_acquire(hlock->instance,
+ switch (__lock_acquire(hlock->instance,
hlock_class(hlock)->subclass,
hlock->trylock,
hlock->read, hlock->check,
hlock->hardirqs_off,
hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count))
+ hlock->references, hlock->pin_count)) {
+ case 0:
return 1;
+ case 1:
+ break;
+ case 2:
+ *merged += (idx == first_idx);
+ break;
+ default:
+ WARN_ON(1);
+ return 0;
+ }
}
return 0;
}
@@ -3946,9 +4043,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
unsigned long ip)
{
struct task_struct *curr = current;
+ unsigned int depth, merged = 0;
struct held_lock *hlock;
struct lock_class *class;
- unsigned int depth;
int i;
if (unlikely(!debug_locks))
@@ -3963,24 +4060,26 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
return 0;
hlock = find_held_lock(curr, lock, depth, &i);
- if (!hlock)
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (!hlock) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
lockdep_init_map(lock, name, key, 0);
class = register_lock_class(lock, subclass, 0);
- hlock->class_idx = class - lock_classes + 1;
+ hlock->class_idx = class - lock_classes;
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
- if (reacquire_held_locks(curr, depth, i))
+ if (reacquire_held_locks(curr, depth, i, &merged))
return 0;
/*
* I took it apart and put it back together again, except now I have
* these 'spare' parts.. where shall I put them.
*/
- if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged))
return 0;
return 1;
}
@@ -3988,8 +4087,8 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
+ unsigned int depth, merged = 0;
struct held_lock *hlock;
- unsigned int depth;
int i;
if (unlikely(!debug_locks))
@@ -4004,8 +4103,10 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
return 0;
hlock = find_held_lock(curr, lock, depth, &i);
- if (!hlock)
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (!hlock) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
@@ -4014,7 +4115,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
hlock->read = 1;
hlock->acquire_ip = ip;
- if (reacquire_held_locks(curr, depth, i))
+ if (reacquire_held_locks(curr, depth, i, &merged))
+ return 0;
+
+ /* Merging can't happen with unchanged classes.. */
+ if (DEBUG_LOCKS_WARN_ON(merged))
return 0;
/*
@@ -4023,6 +4128,7 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
*/
if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
return 0;
+
return 1;
}
@@ -4034,11 +4140,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
* @nested is an hysterical artifact, needs a tree wide cleanup.
*/
static int
-__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+__lock_release(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
+ unsigned int depth, merged = 1;
struct held_lock *hlock;
- unsigned int depth;
int i;
if (unlikely(!debug_locks))
@@ -4049,16 +4155,20 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
* So we're all set to release this lock.. wait what lock? We don't
* own any locks, you've been drinking again?
*/
- if (DEBUG_LOCKS_WARN_ON(depth <= 0))
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (depth <= 0) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
/*
* Check whether the lock exists in the current stack
* of held locks:
*/
hlock = find_held_lock(curr, lock, depth, &i);
- if (!hlock)
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (!hlock) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
if (hlock->instance == lock)
lock_release_holdtime(hlock);
@@ -4093,14 +4203,15 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
if (i == depth-1)
return 1;
- if (reacquire_held_locks(curr, depth, i + 1))
+ if (reacquire_held_locks(curr, depth, i + 1, &merged))
return 0;
/*
* We had N bottles of beer on the wall, we drank one, but now
* there's not N-1 bottles of beer left on the wall...
+ * Pouring two of the bottles together is acceptable.
*/
- DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth-1);
+ DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged);
/*
* Since reacquire_held_locks() would have called check_chain_key()
@@ -4318,7 +4429,7 @@ void lock_release(struct lockdep_map *lock, int nested,
check_flags(flags);
current->lockdep_recursion = 1;
trace_lock_release(lock, ip);
- if (__lock_release(lock, nested, ip))
+ if (__lock_release(lock, ip))
check_chain_key(current);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
@@ -4401,14 +4512,14 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
EXPORT_SYMBOL_GPL(lock_unpin_lock);
#ifdef CONFIG_LOCK_STAT
-static int
-print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
- unsigned long ip)
+static void print_lock_contention_bug(struct task_struct *curr,
+ struct lockdep_map *lock,
+ unsigned long ip)
{
if (!debug_locks_off())
- return 0;
+ return;
if (debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("=================================\n");
@@ -4426,8 +4537,6 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
static void
@@ -4572,9 +4681,7 @@ void lockdep_reset(void)
int i;
raw_local_irq_save(flags);
- current->curr_chain_key = 0;
- current->lockdep_depth = 0;
- current->lockdep_recursion = 0;
+ lockdep_init_task(current);
memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
nr_hardirq_chains = 0;
nr_softirq_chains = 0;
@@ -4614,9 +4721,9 @@ static void remove_class_from_lock_chain(struct pending_free *pf,
return;
recalc:
- chain_key = 0;
+ chain_key = INITIAL_CHAIN_KEY;
for (i = chain->base; i < chain->base + chain->depth; i++)
- chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1);
+ chain_key = iterate_chain_key(chain_key, chain_hlocks[i]);
if (chain->depth && chain->chain_key == chain_key)
return;
/* Overwrite the chain key for concurrent RCU readers. */
@@ -4690,6 +4797,7 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
WRITE_ONCE(class->key, NULL);
WRITE_ONCE(class->name, NULL);
nr_lock_classes--;
+ __clear_bit(class - lock_classes, lock_classes_in_use);
} else {
WARN_ONCE(true, "%s() failed for class %s\n", __func__,
class->name);
@@ -5035,6 +5143,7 @@ void __init lockdep_init(void)
printk(" memory used by lock dependency info: %zu kB\n",
(sizeof(lock_classes) +
+ sizeof(lock_classes_in_use) +
sizeof(classhash_table) +
sizeof(list_entries) +
sizeof(list_entries_in_use) +
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 150ec3f0c5b5..cc83568d5012 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -131,7 +131,6 @@ extern unsigned int nr_hardirq_chains;
extern unsigned int nr_softirq_chains;
extern unsigned int nr_process_chains;
extern unsigned int max_lockdep_depth;
-extern unsigned int max_recursion_depth;
extern unsigned int max_bfs_queue_depth;
@@ -160,25 +159,22 @@ lockdep_count_backward_deps(struct lock_class *class)
* and we want to avoid too much cache bouncing.
*/
struct lockdep_stats {
- int chain_lookup_hits;
- int chain_lookup_misses;
- int hardirqs_on_events;
- int hardirqs_off_events;
- int redundant_hardirqs_on;
- int redundant_hardirqs_off;
- int softirqs_on_events;
- int softirqs_off_events;
- int redundant_softirqs_on;
- int redundant_softirqs_off;
- int nr_unused_locks;
- int nr_redundant_checks;
- int nr_redundant;
- int nr_cyclic_checks;
- int nr_cyclic_check_recursions;
- int nr_find_usage_forwards_checks;
- int nr_find_usage_forwards_recursions;
- int nr_find_usage_backwards_checks;
- int nr_find_usage_backwards_recursions;
+ unsigned long chain_lookup_hits;
+ unsigned int chain_lookup_misses;
+ unsigned long hardirqs_on_events;
+ unsigned long hardirqs_off_events;
+ unsigned long redundant_hardirqs_on;
+ unsigned long redundant_hardirqs_off;
+ unsigned long softirqs_on_events;
+ unsigned long softirqs_off_events;
+ unsigned long redundant_softirqs_on;
+ unsigned long redundant_softirqs_off;
+ int nr_unused_locks;
+ unsigned int nr_redundant_checks;
+ unsigned int nr_redundant;
+ unsigned int nr_cyclic_checks;
+ unsigned int nr_find_usage_forwards_checks;
+ unsigned int nr_find_usage_backwards_checks;
/*
* Per lock class locking operation stat counts
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 80a463d31a8d..c513031cd7e3 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -975,7 +975,7 @@ static int __init lock_torture_init(void)
goto unwind;
}
if (stutter > 0) {
- firsterr = torture_stutter_init(stutter);
+ firsterr = torture_stutter_init(stutter, stutter);
if (firsterr)
goto unwind;
}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index db578783dd36..0c601ae072b3 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/locking/mutex.c
*
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f17dad99eec8..364d38a0c444 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/atomic.h>
#include <linux/rwsem.h>
#include <linux/percpu.h>
@@ -17,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
return -ENOMEM;
/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
- rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+ rcu_sync_init(&sem->rss);
__init_rwsem(&sem->rw_sem, name, rwsem_key);
rcuwait_init(&sem->writer);
sem->readers_block = 0;
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index c7471c3fb798..fe9ca92faa2a 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -1,16 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Queued read/write locks
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
*
* Authors: Waiman Long <waiman.long@hp.com>
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index e14b32c69639..2473f10c6956 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -1,16 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Queued spinlock
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
* (C) Copyright 2013-2014,2018 Red Hat, Inc.
* (C) Copyright 2015 Intel Corp.
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 54152670ff24..e625bb410aa2 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -1,13 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*
* Authors: Waiman Long <longman@redhat.com>
*/
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 978d63a8261c..38fbf9fa7f1b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* RT-Mutexes: simple blocking mutual exclusion locks with PI support
*
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
deleted file mode 100644
index 0b1f77957240..000000000000
--- a/kernel/locking/rwsem-xadd.c
+++ /dev/null
@@ -1,745 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* rwsem.c: R/W semaphores: contention handling functions
- *
- * Written by David Howells (dhowells@redhat.com).
- * Derived from arch/i386/kernel/semaphore.c
- *
- * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
- * and Michel Lespinasse <walken@google.com>
- *
- * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
- * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
- */
-#include <linux/rwsem.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/wake_q.h>
-#include <linux/sched/debug.h>
-#include <linux/osq_lock.h>
-
-#include "rwsem.h"
-
-/*
- * Guide to the rw_semaphore's count field for common values.
- * (32-bit case illustrated, similar for 64-bit)
- *
- * 0x0000000X (1) X readers active or attempting lock, no writer waiting
- * X = #active_readers + #readers attempting to lock
- * (X*ACTIVE_BIAS)
- *
- * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or
- * attempting to read lock or write lock.
- *
- * 0xffff000X (1) X readers active or attempting lock, with waiters for lock
- * X = #active readers + # readers attempting lock
- * (X*ACTIVE_BIAS + WAITING_BIAS)
- * (2) 1 writer attempting lock, no waiters for lock
- * X-1 = #active readers + #readers attempting lock
- * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
- * (3) 1 writer active, no waiters for lock
- * X-1 = #active readers + #readers attempting lock
- * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS)
- *
- * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock
- * (WAITING_BIAS + ACTIVE_BIAS)
- * (2) 1 writer active or attempting lock, no waiters for lock
- * (ACTIVE_WRITE_BIAS)
- *
- * 0xffff0000 (1) There are writers or readers queued but none active
- * or in the process of attempting lock.
- * (WAITING_BIAS)
- * Note: writer can attempt to steal lock for this count by adding
- * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count
- *
- * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue.
- * (ACTIVE_WRITE_BIAS + WAITING_BIAS)
- *
- * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking
- * the count becomes more than 0 for successful lock acquisition,
- * i.e. the case where there are only readers or nobody has lock.
- * (1st and 2nd case above).
- *
- * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and
- * checking the count becomes ACTIVE_WRITE_BIAS for successful lock
- * acquisition (i.e. nobody else has lock or attempts lock). If
- * unsuccessful, in rwsem_down_write_failed, we'll check to see if there
- * are only waiters but none active (5th case above), and attempt to
- * steal the lock.
- *
- */
-
-/*
- * Initialize an rwsem:
- */
-void __init_rwsem(struct rw_semaphore *sem, const char *name,
- struct lock_class_key *key)
-{
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- /*
- * Make sure we are not reinitializing a held semaphore:
- */
- debug_check_no_locks_freed((void *)sem, sizeof(*sem));
- lockdep_init_map(&sem->dep_map, name, key, 0);
-#endif
- atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
- raw_spin_lock_init(&sem->wait_lock);
- INIT_LIST_HEAD(&sem->wait_list);
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
- sem->owner = NULL;
- osq_lock_init(&sem->osq);
-#endif
-}
-
-EXPORT_SYMBOL(__init_rwsem);
-
-enum rwsem_waiter_type {
- RWSEM_WAITING_FOR_WRITE,
- RWSEM_WAITING_FOR_READ
-};
-
-struct rwsem_waiter {
- struct list_head list;
- struct task_struct *task;
- enum rwsem_waiter_type type;
-};
-
-enum rwsem_wake_type {
- RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
- RWSEM_WAKE_READERS, /* Wake readers only */
- RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
-};
-
-/*
- * handle the lock release when processes blocked on it that can now run
- * - if we come here from up_xxxx(), then:
- * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
- * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
- * - there must be someone on the queue
- * - the wait_lock must be held by the caller
- * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
- * to actually wakeup the blocked task(s) and drop the reference count,
- * preferably when the wait_lock is released
- * - woken process blocks are discarded from the list after having task zeroed
- * - writers are only marked woken if downgrading is false
- */
-static void __rwsem_mark_wake(struct rw_semaphore *sem,
- enum rwsem_wake_type wake_type,
- struct wake_q_head *wake_q)
-{
- struct rwsem_waiter *waiter, *tmp;
- long oldcount, woken = 0, adjustment = 0;
- struct list_head wlist;
-
- /*
- * Take a peek at the queue head waiter such that we can determine
- * the wakeup(s) to perform.
- */
- waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list);
-
- if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wake_type == RWSEM_WAKE_ANY) {
- /*
- * Mark writer at the front of the queue for wakeup.
- * Until the task is actually later awoken later by
- * the caller, other writers are able to steal it.
- * Readers, on the other hand, will block as they
- * will notice the queued writer.
- */
- wake_q_add(wake_q, waiter->task);
- lockevent_inc(rwsem_wake_writer);
- }
-
- return;
- }
-
- /*
- * Writers might steal the lock before we grant it to the next reader.
- * We prefer to do the first reader grant before counting readers
- * so we can bail out early if a writer stole the lock.
- */
- if (wake_type != RWSEM_WAKE_READ_OWNED) {
- adjustment = RWSEM_ACTIVE_READ_BIAS;
- try_reader_grant:
- oldcount = atomic_long_fetch_add(adjustment, &sem->count);
- if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
- /*
- * If the count is still less than RWSEM_WAITING_BIAS
- * after removing the adjustment, it is assumed that
- * a writer has stolen the lock. We have to undo our
- * reader grant.
- */
- if (atomic_long_add_return(-adjustment, &sem->count) <
- RWSEM_WAITING_BIAS)
- return;
-
- /* Last active locker left. Retry waking readers. */
- goto try_reader_grant;
- }
- /*
- * Set it to reader-owned to give spinners an early
- * indication that readers now have the lock.
- */
- __rwsem_set_reader_owned(sem, waiter->task);
- }
-
- /*
- * Grant an infinite number of read locks to the readers at the front
- * of the queue. We know that woken will be at least 1 as we accounted
- * for above. Note we increment the 'active part' of the count by the
- * number of readers before waking any processes up.
- *
- * We have to do wakeup in 2 passes to prevent the possibility that
- * the reader count may be decremented before it is incremented. It
- * is because the to-be-woken waiter may not have slept yet. So it
- * may see waiter->task got cleared, finish its critical section and
- * do an unlock before the reader count increment.
- *
- * 1) Collect the read-waiters in a separate list, count them and
- * fully increment the reader count in rwsem.
- * 2) For each waiters in the new list, clear waiter->task and
- * put them into wake_q to be woken up later.
- */
- list_for_each_entry(waiter, &sem->wait_list, list) {
- if (waiter->type == RWSEM_WAITING_FOR_WRITE)
- break;
-
- woken++;
- }
- list_cut_before(&wlist, &sem->wait_list, &waiter->list);
-
- adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
- lockevent_cond_inc(rwsem_wake_reader, woken);
- if (list_empty(&sem->wait_list)) {
- /* hit end of list above */
- adjustment -= RWSEM_WAITING_BIAS;
- }
-
- if (adjustment)
- atomic_long_add(adjustment, &sem->count);
-
- /* 2nd pass */
- list_for_each_entry_safe(waiter, tmp, &wlist, list) {
- struct task_struct *tsk;
-
- tsk = waiter->task;
- get_task_struct(tsk);
-
- /*
- * Ensure calling get_task_struct() before setting the reader
- * waiter to nil such that rwsem_down_read_failed() cannot
- * race with do_exit() by always holding a reference count
- * to the task to wakeup.
- */
- smp_store_release(&waiter->task, NULL);
- /*
- * Ensure issuing the wakeup (either by us or someone else)
- * after setting the reader waiter to nil.
- */
- wake_q_add_safe(wake_q, tsk);
- }
-}
-
-/*
- * This function must be called with the sem->wait_lock held to prevent
- * race conditions between checking the rwsem wait list and setting the
- * sem->count accordingly.
- */
-static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
-{
- /*
- * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS.
- */
- if (count != RWSEM_WAITING_BIAS)
- return false;
-
- /*
- * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there
- * are other tasks on the wait list, we need to add on WAITING_BIAS.
- */
- count = list_is_singular(&sem->wait_list) ?
- RWSEM_ACTIVE_WRITE_BIAS :
- RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS;
-
- if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count)
- == RWSEM_WAITING_BIAS) {
- rwsem_set_owner(sem);
- return true;
- }
-
- return false;
-}
-
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-/*
- * Try to acquire write lock before the writer has been put on wait queue.
- */
-static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
-{
- long count = atomic_long_read(&sem->count);
-
- while (!count || count == RWSEM_WAITING_BIAS) {
- if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
- count + RWSEM_ACTIVE_WRITE_BIAS)) {
- rwsem_set_owner(sem);
- lockevent_inc(rwsem_opt_wlock);
- return true;
- }
- }
- return false;
-}
-
-static inline bool owner_on_cpu(struct task_struct *owner)
-{
- /*
- * As lock holder preemption issue, we both skip spinning if
- * task is not on cpu or its cpu is preempted
- */
- return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
-}
-
-static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
-{
- struct task_struct *owner;
- bool ret = true;
-
- BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN));
-
- if (need_resched())
- return false;
-
- rcu_read_lock();
- owner = READ_ONCE(sem->owner);
- if (owner) {
- ret = is_rwsem_owner_spinnable(owner) &&
- owner_on_cpu(owner);
- }
- rcu_read_unlock();
- return ret;
-}
-
-/*
- * Return true only if we can still spin on the owner field of the rwsem.
- */
-static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
-{
- struct task_struct *owner = READ_ONCE(sem->owner);
-
- if (!is_rwsem_owner_spinnable(owner))
- return false;
-
- rcu_read_lock();
- while (owner && (READ_ONCE(sem->owner) == owner)) {
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_
- * checking sem->owner still matches owner, if that fails,
- * owner might point to free()d memory, if it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
- */
- barrier();
-
- /*
- * abort spinning when need_resched or owner is not running or
- * owner's cpu is preempted.
- */
- if (need_resched() || !owner_on_cpu(owner)) {
- rcu_read_unlock();
- return false;
- }
-
- cpu_relax();
- }
- rcu_read_unlock();
-
- /*
- * If there is a new owner or the owner is not set, we continue
- * spinning.
- */
- return is_rwsem_owner_spinnable(READ_ONCE(sem->owner));
-}
-
-static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
-{
- bool taken = false;
-
- preempt_disable();
-
- /* sem->wait_lock should not be held when doing optimistic spinning */
- if (!rwsem_can_spin_on_owner(sem))
- goto done;
-
- if (!osq_lock(&sem->osq))
- goto done;
-
- /*
- * Optimistically spin on the owner field and attempt to acquire the
- * lock whenever the owner changes. Spinning will be stopped when:
- * 1) the owning writer isn't running; or
- * 2) readers own the lock as we can't determine if they are
- * actively running or not.
- */
- while (rwsem_spin_on_owner(sem)) {
- /*
- * Try to acquire the lock
- */
- if (rwsem_try_write_lock_unqueued(sem)) {
- taken = true;
- break;
- }
-
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
- */
- if (!sem->owner && (need_resched() || rt_task(current)))
- break;
-
- /*
- * The cpu_relax() call is a compiler barrier which forces
- * everything in this loop to be re-loaded. We don't need
- * memory barriers as we'll eventually observe the right
- * values at the cost of a few extra spins.
- */
- cpu_relax();
- }
- osq_unlock(&sem->osq);
-done:
- preempt_enable();
- lockevent_cond_inc(rwsem_opt_fail, !taken);
- return taken;
-}
-
-/*
- * Return true if the rwsem has active spinner
- */
-static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
-{
- return osq_is_locked(&sem->osq);
-}
-
-#else
-static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
-{
- return false;
-}
-
-static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
-{
- return false;
-}
-#endif
-
-/*
- * Wait for the read lock to be granted
- */
-static inline struct rw_semaphore __sched *
-__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
-{
- long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
- struct rwsem_waiter waiter;
- DEFINE_WAKE_Q(wake_q);
-
- waiter.task = current;
- waiter.type = RWSEM_WAITING_FOR_READ;
-
- raw_spin_lock_irq(&sem->wait_lock);
- if (list_empty(&sem->wait_list)) {
- /*
- * In case the wait queue is empty and the lock isn't owned
- * by a writer, this reader can exit the slowpath and return
- * immediately as its RWSEM_ACTIVE_READ_BIAS has already
- * been set in the count.
- */
- if (atomic_long_read(&sem->count) >= 0) {
- raw_spin_unlock_irq(&sem->wait_lock);
- rwsem_set_reader_owned(sem);
- lockevent_inc(rwsem_rlock_fast);
- return sem;
- }
- adjustment += RWSEM_WAITING_BIAS;
- }
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* we're now waiting on the lock, but no longer actively locking */
- count = atomic_long_add_return(adjustment, &sem->count);
-
- /*
- * If there are no active locks, wake the front queued process(es).
- *
- * If there are no writers and we are first in the queue,
- * wake our own waiter to join the existing active readers !
- */
- if (count == RWSEM_WAITING_BIAS ||
- (count > RWSEM_WAITING_BIAS &&
- adjustment != -RWSEM_ACTIVE_READ_BIAS))
- __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
-
- raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
-
- /* wait to be given the lock */
- while (true) {
- set_current_state(state);
- if (!waiter.task)
- break;
- if (signal_pending_state(state, current)) {
- raw_spin_lock_irq(&sem->wait_lock);
- if (waiter.task)
- goto out_nolock;
- raw_spin_unlock_irq(&sem->wait_lock);
- break;
- }
- schedule();
- lockevent_inc(rwsem_sleep_reader);
- }
-
- __set_current_state(TASK_RUNNING);
- lockevent_inc(rwsem_rlock);
- return sem;
-out_nolock:
- list_del(&waiter.list);
- if (list_empty(&sem->wait_list))
- atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
- raw_spin_unlock_irq(&sem->wait_lock);
- __set_current_state(TASK_RUNNING);
- lockevent_inc(rwsem_rlock_fail);
- return ERR_PTR(-EINTR);
-}
-
-__visible struct rw_semaphore * __sched
-rwsem_down_read_failed(struct rw_semaphore *sem)
-{
- return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(rwsem_down_read_failed);
-
-__visible struct rw_semaphore * __sched
-rwsem_down_read_failed_killable(struct rw_semaphore *sem)
-{
- return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
-}
-EXPORT_SYMBOL(rwsem_down_read_failed_killable);
-
-/*
- * Wait until we successfully acquire the write lock
- */
-static inline struct rw_semaphore *
-__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
-{
- long count;
- bool waiting = true; /* any queued threads before us */
- struct rwsem_waiter waiter;
- struct rw_semaphore *ret = sem;
- DEFINE_WAKE_Q(wake_q);
-
- /* undo write bias from down_write operation, stop active locking */
- count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
-
- /* do optimistic spinning and steal lock if possible */
- if (rwsem_optimistic_spin(sem))
- return sem;
-
- /*
- * Optimistic spinning failed, proceed to the slowpath
- * and block until we can acquire the sem.
- */
- waiter.task = current;
- waiter.type = RWSEM_WAITING_FOR_WRITE;
-
- raw_spin_lock_irq(&sem->wait_lock);
-
- /* account for this before adding a new element to the list */
- if (list_empty(&sem->wait_list))
- waiting = false;
-
- list_add_tail(&waiter.list, &sem->wait_list);
-
- /* we're now waiting on the lock, but no longer actively locking */
- if (waiting) {
- count = atomic_long_read(&sem->count);
-
- /*
- * If there were already threads queued before us and there are
- * no active writers, the lock must be read owned; so we try to
- * wake any read locks that were queued ahead of us.
- */
- if (count > RWSEM_WAITING_BIAS) {
- __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
- /*
- * The wakeup is normally called _after_ the wait_lock
- * is released, but given that we are proactively waking
- * readers we can deal with the wake_q overhead as it is
- * similar to releasing and taking the wait_lock again
- * for attempting rwsem_try_write_lock().
- */
- wake_up_q(&wake_q);
-
- /*
- * Reinitialize wake_q after use.
- */
- wake_q_init(&wake_q);
- }
-
- } else
- count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count);
-
- /* wait until we successfully acquire the lock */
- set_current_state(state);
- while (true) {
- if (rwsem_try_write_lock(count, sem))
- break;
- raw_spin_unlock_irq(&sem->wait_lock);
-
- /* Block until there are no active lockers. */
- do {
- if (signal_pending_state(state, current))
- goto out_nolock;
-
- schedule();
- lockevent_inc(rwsem_sleep_writer);
- set_current_state(state);
- } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
-
- raw_spin_lock_irq(&sem->wait_lock);
- }
- __set_current_state(TASK_RUNNING);
- list_del(&waiter.list);
- raw_spin_unlock_irq(&sem->wait_lock);
- lockevent_inc(rwsem_wlock);
-
- return ret;
-
-out_nolock:
- __set_current_state(TASK_RUNNING);
- raw_spin_lock_irq(&sem->wait_lock);
- list_del(&waiter.list);
- if (list_empty(&sem->wait_list))
- atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
- else
- __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
- raw_spin_unlock_irq(&sem->wait_lock);
- wake_up_q(&wake_q);
- lockevent_inc(rwsem_wlock_fail);
-
- return ERR_PTR(-EINTR);
-}
-
-__visible struct rw_semaphore * __sched
-rwsem_down_write_failed(struct rw_semaphore *sem)
-{
- return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE);
-}
-EXPORT_SYMBOL(rwsem_down_write_failed);
-
-__visible struct rw_semaphore * __sched
-rwsem_down_write_failed_killable(struct rw_semaphore *sem)
-{
- return __rwsem_down_write_failed_common(sem, TASK_KILLABLE);
-}
-EXPORT_SYMBOL(rwsem_down_write_failed_killable);
-
-/*
- * handle waking up a waiter on the semaphore
- * - up_read/up_write has decremented the active part of count if we come here
- */
-__visible
-struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
-{
- unsigned long flags;
- DEFINE_WAKE_Q(wake_q);
-
- /*
- * __rwsem_down_write_failed_common(sem)
- * rwsem_optimistic_spin(sem)
- * osq_unlock(sem->osq)
- * ...
- * atomic_long_add_return(&sem->count)
- *
- * - VS -
- *
- * __up_write()
- * if (atomic_long_sub_return_release(&sem->count) < 0)
- * rwsem_wake(sem)
- * osq_is_locked(&sem->osq)
- *
- * And __up_write() must observe !osq_is_locked() when it observes the
- * atomic_long_add_return() in order to not miss a wakeup.
- *
- * This boils down to:
- *
- * [S.rel] X = 1 [RmW] r0 = (Y += 0)
- * MB RMB
- * [RmW] Y += 1 [L] r1 = X
- *
- * exists (r0=1 /\ r1=0)
- */
- smp_rmb();
-
- /*
- * If a spinner is present, it is not necessary to do the wakeup.
- * Try to do wakeup only if the trylock succeeds to minimize
- * spinlock contention which may introduce too much delay in the
- * unlock operation.
- *
- * spinning writer up_write/up_read caller
- * --------------- -----------------------
- * [S] osq_unlock() [L] osq
- * MB RMB
- * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
- *
- * Here, it is important to make sure that there won't be a missed
- * wakeup while the rwsem is free and the only spinning writer goes
- * to sleep without taking the rwsem. Even when the spinning writer
- * is just going to break out of the waiting loop, it will still do
- * a trylock in rwsem_down_write_failed() before sleeping. IOW, if
- * rwsem_has_spinner() is true, it will guarantee at least one
- * trylock attempt on the rwsem later on.
- */
- if (rwsem_has_spinner(sem)) {
- /*
- * The smp_rmb() here is to make sure that the spinner
- * state is consulted before reading the wait_lock.
- */
- smp_rmb();
- if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
- return sem;
- goto locked;
- }
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-locked:
-
- if (!list_empty(&sem->wait_list))
- __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- wake_up_q(&wake_q);
-
- return sem;
-}
-EXPORT_SYMBOL(rwsem_wake);
-
-/*
- * downgrade a write lock into a read lock
- * - caller incremented waiting part of count and discovered it still negative
- * - just wake up any readers at the front of the queue
- */
-__visible
-struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
-{
- unsigned long flags;
- DEFINE_WAKE_Q(wake_q);
-
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
-
- if (!list_empty(&sem->wait_list))
- __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
-
- raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
- wake_up_q(&wake_q);
-
- return sem;
-}
-EXPORT_SYMBOL(rwsem_downgrade_wake);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index ccbf18f560ff..37524a47f002 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -3,17 +3,1438 @@
*
* Written by David Howells (dhowells@redhat.com).
* Derived from asm-i386/semaphore.h
+ *
+ * Writer lock-stealing by Alex Shi <alex.shi@intel.com>
+ * and Michel Lespinasse <walken@google.com>
+ *
+ * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
+ * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
+ *
+ * Rwsem count bit fields re-definition and rwsem rearchitecture by
+ * Waiman Long <longman@redhat.com> and
+ * Peter Zijlstra <peterz@infradead.org>.
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
#include <linux/sched/debug.h>
+#include <linux/sched/wake_q.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/clock.h>
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
#include "rwsem.h"
+#include "lock_events.h"
+
+/*
+ * The least significant 3 bits of the owner value has the following
+ * meanings when set.
+ * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers
+ * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock.
+ * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock.
+ *
+ * When the rwsem is either owned by an anonymous writer, or it is
+ * reader-owned, but a spinning writer has timed out, both nonspinnable
+ * bits will be set to disable optimistic spinning by readers and writers.
+ * In the later case, the last unlocking reader should then check the
+ * writer nonspinnable bit and clear it only to give writers preference
+ * to acquire the lock via optimistic spinning, but not readers. Similar
+ * action is also done in the reader slowpath.
+
+ * When a writer acquires a rwsem, it puts its task_struct pointer
+ * into the owner field. It is cleared after an unlock.
+ *
+ * When a reader acquires a rwsem, it will also puts its task_struct
+ * pointer into the owner field with the RWSEM_READER_OWNED bit set.
+ * On unlock, the owner field will largely be left untouched. So
+ * for a free or reader-owned rwsem, the owner value may contain
+ * information about the last reader that acquires the rwsem.
+ *
+ * That information may be helpful in debugging cases where the system
+ * seems to hang on a reader owned rwsem especially if only one reader
+ * is involved. Ideally we would like to track all the readers that own
+ * a rwsem, but the overhead is simply too big.
+ *
+ * Reader optimistic spinning is helpful when the reader critical section
+ * is short and there aren't that many readers around. It makes readers
+ * relatively more preferred than writers. When a writer times out spinning
+ * on a reader-owned lock and set the nospinnable bits, there are two main
+ * reasons for that.
+ *
+ * 1) The reader critical section is long, perhaps the task sleeps after
+ * acquiring the read lock.
+ * 2) There are just too many readers contending the lock causing it to
+ * take a while to service all of them.
+ *
+ * In the former case, long reader critical section will impede the progress
+ * of writers which is usually more important for system performance. In
+ * the later case, reader optimistic spinning tends to make the reader
+ * groups that contain readers that acquire the lock together smaller
+ * leading to more of them. That may hurt performance in some cases. In
+ * other words, the setting of nonspinnable bits indicates that reader
+ * optimistic spinning may not be helpful for those workloads that cause
+ * it.
+ *
+ * Therefore, any writers that had observed the setting of the writer
+ * nonspinnable bit for a given rwsem after they fail to acquire the lock
+ * via optimistic spinning will set the reader nonspinnable bit once they
+ * acquire the write lock. Similarly, readers that observe the setting
+ * of reader nonspinnable bit at slowpath entry will set the reader
+ * nonspinnable bits when they acquire the read lock via the wakeup path.
+ *
+ * Once the reader nonspinnable bit is on, it will only be reset when
+ * a writer is able to acquire the rwsem in the fast path or somehow a
+ * reader or writer in the slowpath doesn't observe the nonspinable bit.
+ *
+ * This is to discourage reader optmistic spinning on that particular
+ * rwsem and make writers more preferred. This adaptive disabling of reader
+ * optimistic spinning will alleviate the negative side effect of this
+ * feature.
+ */
+#define RWSEM_READER_OWNED (1UL << 0)
+#define RWSEM_RD_NONSPINNABLE (1UL << 1)
+#define RWSEM_WR_NONSPINNABLE (1UL << 2)
+#define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE)
+#define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)
+
+#ifdef CONFIG_DEBUG_RWSEMS
+# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
+ if (!debug_locks_silent && \
+ WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
+ #c, atomic_long_read(&(sem)->count), \
+ atomic_long_read(&(sem)->owner), (long)current, \
+ list_empty(&(sem)->wait_list) ? "" : "not ")) \
+ debug_locks_off(); \
+ } while (0)
+#else
+# define DEBUG_RWSEMS_WARN_ON(c, sem)
+#endif
+
+/*
+ * On 64-bit architectures, the bit definitions of the count are:
+ *
+ * Bit 0 - writer locked bit
+ * Bit 1 - waiters present bit
+ * Bit 2 - lock handoff bit
+ * Bits 3-7 - reserved
+ * Bits 8-62 - 55-bit reader count
+ * Bit 63 - read fail bit
+ *
+ * On 32-bit architectures, the bit definitions of the count are:
+ *
+ * Bit 0 - writer locked bit
+ * Bit 1 - waiters present bit
+ * Bit 2 - lock handoff bit
+ * Bits 3-7 - reserved
+ * Bits 8-30 - 23-bit reader count
+ * Bit 31 - read fail bit
+ *
+ * It is not likely that the most significant bit (read fail bit) will ever
+ * be set. This guard bit is still checked anyway in the down_read() fastpath
+ * just in case we need to use up more of the reader bits for other purpose
+ * in the future.
+ *
+ * atomic_long_fetch_add() is used to obtain reader lock, whereas
+ * atomic_long_cmpxchg() will be used to obtain writer lock.
+ *
+ * There are three places where the lock handoff bit may be set or cleared.
+ * 1) rwsem_mark_wake() for readers.
+ * 2) rwsem_try_write_lock() for writers.
+ * 3) Error path of rwsem_down_write_slowpath().
+ *
+ * For all the above cases, wait_lock will be held. A writer must also
+ * be the first one in the wait_list to be eligible for setting the handoff
+ * bit. So concurrent setting/clearing of handoff bit is not possible.
+ */
+#define RWSEM_WRITER_LOCKED (1UL << 0)
+#define RWSEM_FLAG_WAITERS (1UL << 1)
+#define RWSEM_FLAG_HANDOFF (1UL << 2)
+#define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1))
+
+#define RWSEM_READER_SHIFT 8
+#define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT)
+#define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1))
+#define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED
+#define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK)
+#define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\
+ RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL)
+
+/*
+ * All writes to owner are protected by WRITE_ONCE() to make sure that
+ * store tearing can't happen as optimistic spinners may read and use
+ * the owner value concurrently without lock. Read from owner, however,
+ * may not need READ_ONCE() as long as the pointer value is only used
+ * for comparison and isn't being dereferenced.
+ */
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ atomic_long_set(&sem->owner, (long)current);
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ atomic_long_set(&sem->owner, 0);
+}
+
+/*
+ * Test the flags in the owner field.
+ */
+static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags)
+{
+ return atomic_long_read(&sem->owner) & flags;
+}
+
+/*
+ * The task_struct pointer of the last owning reader will be left in
+ * the owner field.
+ *
+ * Note that the owner value just indicates the task has owned the rwsem
+ * previously, it may not be the real owner or one of the real owners
+ * anymore when that field is examined, so take it with a grain of salt.
+ *
+ * The reader non-spinnable bit is preserved.
+ */
+static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
+ struct task_struct *owner)
+{
+ unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED |
+ (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE);
+
+ atomic_long_set(&sem->owner, val);
+}
+
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+ __rwsem_set_reader_owned(sem, current);
+}
+
+/*
+ * Return true if the rwsem is owned by a reader.
+ */
+static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem)
+{
+#ifdef CONFIG_DEBUG_RWSEMS
+ /*
+ * Check the count to see if it is write-locked.
+ */
+ long count = atomic_long_read(&sem->count);
+
+ if (count & RWSEM_WRITER_MASK)
+ return false;
+#endif
+ return rwsem_test_oflags(sem, RWSEM_READER_OWNED);
+}
+
+#ifdef CONFIG_DEBUG_RWSEMS
+/*
+ * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
+ * is a task pointer in owner of a reader-owned rwsem, it will be the
+ * real owner or one of the real owners. The only exception is when the
+ * unlock is done by up_read_non_owner().
+ */
+static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
+{
+ unsigned long val = atomic_long_read(&sem->owner);
+
+ while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) {
+ if (atomic_long_try_cmpxchg(&sem->owner, &val,
+ val & RWSEM_OWNER_FLAGS_MASK))
+ return;
+ }
+}
+#else
+static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
+{
+}
+#endif
+
+/*
+ * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag
+ * remains set. Otherwise, the operation will be aborted.
+ */
+static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem)
+{
+ unsigned long owner = atomic_long_read(&sem->owner);
+
+ do {
+ if (!(owner & RWSEM_READER_OWNED))
+ break;
+ if (owner & RWSEM_NONSPINNABLE)
+ break;
+ } while (!atomic_long_try_cmpxchg(&sem->owner, &owner,
+ owner | RWSEM_NONSPINNABLE));
+}
+
+static inline bool rwsem_read_trylock(struct rw_semaphore *sem)
+{
+ long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count);
+ if (WARN_ON_ONCE(cnt < 0))
+ rwsem_set_nonspinnable(sem);
+ return !(cnt & RWSEM_READ_FAILED_MASK);
+}
+
+/*
+ * Return just the real task structure pointer of the owner
+ */
+static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem)
+{
+ return (struct task_struct *)
+ (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK);
+}
+
+/*
+ * Return the real task structure pointer of the owner and the embedded
+ * flags in the owner. pflags must be non-NULL.
+ */
+static inline struct task_struct *
+rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags)
+{
+ unsigned long owner = atomic_long_read(&sem->owner);
+
+ *pflags = owner & RWSEM_OWNER_FLAGS_MASK;
+ return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK);
+}
+
+/*
+ * Guide to the rw_semaphore's count field.
+ *
+ * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned
+ * by a writer.
+ *
+ * The lock is owned by readers when
+ * (1) the RWSEM_WRITER_LOCKED isn't set in count,
+ * (2) some of the reader bits are set in count, and
+ * (3) the owner field has RWSEM_READ_OWNED bit set.
+ *
+ * Having some reader bits set is not enough to guarantee a readers owned
+ * lock as the readers may be in the process of backing out from the count
+ * and a writer has just released the lock. So another writer may steal
+ * the lock immediately after that.
+ */
+
+/*
+ * Initialize an rwsem:
+ */
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ /*
+ * Make sure we are not reinitializing a held semaphore:
+ */
+ debug_check_no_locks_freed((void *)sem, sizeof(*sem));
+ lockdep_init_map(&sem->dep_map, name, key, 0);
+#endif
+ atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
+ raw_spin_lock_init(&sem->wait_lock);
+ INIT_LIST_HEAD(&sem->wait_list);
+ atomic_long_set(&sem->owner, 0L);
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+ osq_lock_init(&sem->osq);
+#endif
+}
+EXPORT_SYMBOL(__init_rwsem);
+
+enum rwsem_waiter_type {
+ RWSEM_WAITING_FOR_WRITE,
+ RWSEM_WAITING_FOR_READ
+};
+
+struct rwsem_waiter {
+ struct list_head list;
+ struct task_struct *task;
+ enum rwsem_waiter_type type;
+ unsigned long timeout;
+ unsigned long last_rowner;
+};
+#define rwsem_first_waiter(sem) \
+ list_first_entry(&sem->wait_list, struct rwsem_waiter, list)
+
+enum rwsem_wake_type {
+ RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */
+ RWSEM_WAKE_READERS, /* Wake readers only */
+ RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
+};
+
+enum writer_wait_state {
+ WRITER_NOT_FIRST, /* Writer is not first in wait list */
+ WRITER_FIRST, /* Writer is first in wait list */
+ WRITER_HANDOFF /* Writer is first & handoff needed */
+};
+
+/*
+ * The typical HZ value is either 250 or 1000. So set the minimum waiting
+ * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait
+ * queue before initiating the handoff protocol.
+ */
+#define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250)
+
+/*
+ * Magic number to batch-wakeup waiting readers, even when writers are
+ * also present in the queue. This both limits the amount of work the
+ * waking thread must do and also prevents any potential counter overflow,
+ * however unlikely.
+ */
+#define MAX_READERS_WAKEUP 0x100
+
+/*
+ * handle the lock release when processes blocked on it that can now run
+ * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must
+ * have been set.
+ * - there must be someone on the queue
+ * - the wait_lock must be held by the caller
+ * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
+ * to actually wakeup the blocked task(s) and drop the reference count,
+ * preferably when the wait_lock is released
+ * - woken process blocks are discarded from the list after having task zeroed
+ * - writers are only marked woken if downgrading is false
+ */
+static void rwsem_mark_wake(struct rw_semaphore *sem,
+ enum rwsem_wake_type wake_type,
+ struct wake_q_head *wake_q)
+{
+ struct rwsem_waiter *waiter, *tmp;
+ long oldcount, woken = 0, adjustment = 0;
+ struct list_head wlist;
+
+ lockdep_assert_held(&sem->wait_lock);
+
+ /*
+ * Take a peek at the queue head waiter such that we can determine
+ * the wakeup(s) to perform.
+ */
+ waiter = rwsem_first_waiter(sem);
+
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
+ if (wake_type == RWSEM_WAKE_ANY) {
+ /*
+ * Mark writer at the front of the queue for wakeup.
+ * Until the task is actually later awoken later by
+ * the caller, other writers are able to steal it.
+ * Readers, on the other hand, will block as they
+ * will notice the queued writer.
+ */
+ wake_q_add(wake_q, waiter->task);
+ lockevent_inc(rwsem_wake_writer);
+ }
+
+ return;
+ }
+
+ /*
+ * No reader wakeup if there are too many of them already.
+ */
+ if (unlikely(atomic_long_read(&sem->count) < 0))
+ return;
+
+ /*
+ * Writers might steal the lock before we grant it to the next reader.
+ * We prefer to do the first reader grant before counting readers
+ * so we can bail out early if a writer stole the lock.
+ */
+ if (wake_type != RWSEM_WAKE_READ_OWNED) {
+ struct task_struct *owner;
+
+ adjustment = RWSEM_READER_BIAS;
+ oldcount = atomic_long_fetch_add(adjustment, &sem->count);
+ if (unlikely(oldcount & RWSEM_WRITER_MASK)) {
+ /*
+ * When we've been waiting "too" long (for writers
+ * to give up the lock), request a HANDOFF to
+ * force the issue.
+ */
+ if (!(oldcount & RWSEM_FLAG_HANDOFF) &&
+ time_after(jiffies, waiter->timeout)) {
+ adjustment -= RWSEM_FLAG_HANDOFF;
+ lockevent_inc(rwsem_rlock_handoff);
+ }
+
+ atomic_long_add(-adjustment, &sem->count);
+ return;
+ }
+ /*
+ * Set it to reader-owned to give spinners an early
+ * indication that readers now have the lock.
+ * The reader nonspinnable bit seen at slowpath entry of
+ * the reader is copied over.
+ */
+ owner = waiter->task;
+ if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) {
+ owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE);
+ lockevent_inc(rwsem_opt_norspin);
+ }
+ __rwsem_set_reader_owned(sem, owner);
+ }
+
+ /*
+ * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the
+ * queue. We know that the woken will be at least 1 as we accounted
+ * for above. Note we increment the 'active part' of the count by the
+ * number of readers before waking any processes up.
+ *
+ * This is an adaptation of the phase-fair R/W locks where at the
+ * reader phase (first waiter is a reader), all readers are eligible
+ * to acquire the lock at the same time irrespective of their order
+ * in the queue. The writers acquire the lock according to their
+ * order in the queue.
+ *
+ * We have to do wakeup in 2 passes to prevent the possibility that
+ * the reader count may be decremented before it is incremented. It
+ * is because the to-be-woken waiter may not have slept yet. So it
+ * may see waiter->task got cleared, finish its critical section and
+ * do an unlock before the reader count increment.
+ *
+ * 1) Collect the read-waiters in a separate list, count them and
+ * fully increment the reader count in rwsem.
+ * 2) For each waiters in the new list, clear waiter->task and
+ * put them into wake_q to be woken up later.
+ */
+ INIT_LIST_HEAD(&wlist);
+ list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
+ if (waiter->type == RWSEM_WAITING_FOR_WRITE)
+ continue;
+
+ woken++;
+ list_move_tail(&waiter->list, &wlist);
+
+ /*
+ * Limit # of readers that can be woken up per wakeup call.
+ */
+ if (woken >= MAX_READERS_WAKEUP)
+ break;
+ }
+
+ adjustment = woken * RWSEM_READER_BIAS - adjustment;
+ lockevent_cond_inc(rwsem_wake_reader, woken);
+ if (list_empty(&sem->wait_list)) {
+ /* hit end of list above */
+ adjustment -= RWSEM_FLAG_WAITERS;
+ }
+
+ /*
+ * When we've woken a reader, we no longer need to force writers
+ * to give up the lock and we can clear HANDOFF.
+ */
+ if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF))
+ adjustment -= RWSEM_FLAG_HANDOFF;
+
+ if (adjustment)
+ atomic_long_add(adjustment, &sem->count);
+
+ /* 2nd pass */
+ list_for_each_entry_safe(waiter, tmp, &wlist, list) {
+ struct task_struct *tsk;
+
+ tsk = waiter->task;
+ get_task_struct(tsk);
+
+ /*
+ * Ensure calling get_task_struct() before setting the reader
+ * waiter to nil such that rwsem_down_read_slowpath() cannot
+ * race with do_exit() by always holding a reference count
+ * to the task to wakeup.
+ */
+ smp_store_release(&waiter->task, NULL);
+ /*
+ * Ensure issuing the wakeup (either by us or someone else)
+ * after setting the reader waiter to nil.
+ */
+ wake_q_add_safe(wake_q, tsk);
+ }
+}
+
+/*
+ * This function must be called with the sem->wait_lock held to prevent
+ * race conditions between checking the rwsem wait list and setting the
+ * sem->count accordingly.
+ *
+ * If wstate is WRITER_HANDOFF, it will make sure that either the handoff
+ * bit is set or the lock is acquired with handoff bit cleared.
+ */
+static inline bool rwsem_try_write_lock(struct rw_semaphore *sem,
+ enum writer_wait_state wstate)
+{
+ long count, new;
+
+ lockdep_assert_held(&sem->wait_lock);
+
+ count = atomic_long_read(&sem->count);
+ do {
+ bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF);
+
+ if (has_handoff && wstate == WRITER_NOT_FIRST)
+ return false;
+
+ new = count;
+
+ if (count & RWSEM_LOCK_MASK) {
+ if (has_handoff || (wstate != WRITER_HANDOFF))
+ return false;
+
+ new |= RWSEM_FLAG_HANDOFF;
+ } else {
+ new |= RWSEM_WRITER_LOCKED;
+ new &= ~RWSEM_FLAG_HANDOFF;
+
+ if (list_is_singular(&sem->wait_list))
+ new &= ~RWSEM_FLAG_WAITERS;
+ }
+ } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new));
+
+ /*
+ * We have either acquired the lock with handoff bit cleared or
+ * set the handoff bit.
+ */
+ if (new & RWSEM_FLAG_HANDOFF)
+ return false;
+
+ rwsem_set_owner(sem);
+ return true;
+}
+
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+/*
+ * Try to acquire read lock before the reader is put on wait queue.
+ * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff
+ * is ongoing.
+ */
+static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem)
+{
+ long count = atomic_long_read(&sem->count);
+
+ if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))
+ return false;
+
+ count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count);
+ if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+ rwsem_set_reader_owned(sem);
+ lockevent_inc(rwsem_opt_rlock);
+ return true;
+ }
+
+ /* Back out the change */
+ atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
+ return false;
+}
+
+/*
+ * Try to acquire write lock before the writer has been put on wait queue.
+ */
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+{
+ long count = atomic_long_read(&sem->count);
+
+ while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) {
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
+ count | RWSEM_WRITER_LOCKED)) {
+ rwsem_set_owner(sem);
+ lockevent_inc(rwsem_opt_wlock);
+ return true;
+ }
+ }
+ return false;
+}
+
+static inline bool owner_on_cpu(struct task_struct *owner)
+{
+ /*
+ * As lock holder preemption issue, we both skip spinning if
+ * task is not on cpu or its cpu is preempted
+ */
+ return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
+}
+
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
+ unsigned long nonspinnable)
+{
+ struct task_struct *owner;
+ unsigned long flags;
+ bool ret = true;
+
+ BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE));
+
+ if (need_resched()) {
+ lockevent_inc(rwsem_opt_fail);
+ return false;
+ }
+
+ preempt_disable();
+ rcu_read_lock();
+ owner = rwsem_owner_flags(sem, &flags);
+ if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner)))
+ ret = false;
+ rcu_read_unlock();
+ preempt_enable();
+
+ lockevent_cond_inc(rwsem_opt_fail, !ret);
+ return ret;
+}
+
+/*
+ * The rwsem_spin_on_owner() function returns the folowing 4 values
+ * depending on the lock owner state.
+ * OWNER_NULL : owner is currently NULL
+ * OWNER_WRITER: when owner changes and is a writer
+ * OWNER_READER: when owner changes and the new owner may be a reader.
+ * OWNER_NONSPINNABLE:
+ * when optimistic spinning has to stop because either the
+ * owner stops running, is unknown, or its timeslice has
+ * been used up.
+ */
+enum owner_state {
+ OWNER_NULL = 1 << 0,
+ OWNER_WRITER = 1 << 1,
+ OWNER_READER = 1 << 2,
+ OWNER_NONSPINNABLE = 1 << 3,
+};
+#define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)
+
+static inline enum owner_state
+rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable)
+{
+ if (flags & nonspinnable)
+ return OWNER_NONSPINNABLE;
+
+ if (flags & RWSEM_READER_OWNED)
+ return OWNER_READER;
+
+ return owner ? OWNER_WRITER : OWNER_NULL;
+}
+
+static noinline enum owner_state
+rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
+{
+ struct task_struct *new, *owner;
+ unsigned long flags, new_flags;
+ enum owner_state state;
+
+ owner = rwsem_owner_flags(sem, &flags);
+ state = rwsem_owner_state(owner, flags, nonspinnable);
+ if (state != OWNER_WRITER)
+ return state;
+
+ rcu_read_lock();
+ for (;;) {
+ if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) {
+ state = OWNER_NONSPINNABLE;
+ break;
+ }
+
+ new = rwsem_owner_flags(sem, &new_flags);
+ if ((new != owner) || (new_flags != flags)) {
+ state = rwsem_owner_state(new, new_flags, nonspinnable);
+ break;
+ }
+
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_
+ * checking sem->owner still matches owner, if that fails,
+ * owner might point to free()d memory, if it still matches,
+ * the rcu_read_lock() ensures the memory stays valid.
+ */
+ barrier();
+
+ if (need_resched() || !owner_on_cpu(owner)) {
+ state = OWNER_NONSPINNABLE;
+ break;
+ }
+
+ cpu_relax();
+ }
+ rcu_read_unlock();
+
+ return state;
+}
+
+/*
+ * Calculate reader-owned rwsem spinning threshold for writer
+ *
+ * The more readers own the rwsem, the longer it will take for them to
+ * wind down and free the rwsem. So the empirical formula used to
+ * determine the actual spinning time limit here is:
+ *
+ * Spinning threshold = (10 + nr_readers/2)us
+ *
+ * The limit is capped to a maximum of 25us (30 readers). This is just
+ * a heuristic and is subjected to change in the future.
+ */
+static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem)
+{
+ long count = atomic_long_read(&sem->count);
+ int readers = count >> RWSEM_READER_SHIFT;
+ u64 delta;
+
+ if (readers > 30)
+ readers = 30;
+ delta = (20 + readers) * NSEC_PER_USEC / 2;
+
+ return sched_clock() + delta;
+}
+
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
+{
+ bool taken = false;
+ int prev_owner_state = OWNER_NULL;
+ int loop = 0;
+ u64 rspin_threshold = 0;
+ unsigned long nonspinnable = wlock ? RWSEM_WR_NONSPINNABLE
+ : RWSEM_RD_NONSPINNABLE;
+
+ preempt_disable();
+
+ /* sem->wait_lock should not be held when doing optimistic spinning */
+ if (!osq_lock(&sem->osq))
+ goto done;
+
+ /*
+ * Optimistically spin on the owner field and attempt to acquire the
+ * lock whenever the owner changes. Spinning will be stopped when:
+ * 1) the owning writer isn't running; or
+ * 2) readers own the lock and spinning time has exceeded limit.
+ */
+ for (;;) {
+ enum owner_state owner_state;
+
+ owner_state = rwsem_spin_on_owner(sem, nonspinnable);
+ if (!(owner_state & OWNER_SPINNABLE))
+ break;
+
+ /*
+ * Try to acquire the lock
+ */
+ taken = wlock ? rwsem_try_write_lock_unqueued(sem)
+ : rwsem_try_read_lock_unqueued(sem);
+
+ if (taken)
+ break;
+
+ /*
+ * Time-based reader-owned rwsem optimistic spinning
+ */
+ if (wlock && (owner_state == OWNER_READER)) {
+ /*
+ * Re-initialize rspin_threshold every time when
+ * the owner state changes from non-reader to reader.
+ * This allows a writer to steal the lock in between
+ * 2 reader phases and have the threshold reset at
+ * the beginning of the 2nd reader phase.
+ */
+ if (prev_owner_state != OWNER_READER) {
+ if (rwsem_test_oflags(sem, nonspinnable))
+ break;
+ rspin_threshold = rwsem_rspin_threshold(sem);
+ loop = 0;
+ }
+
+ /*
+ * Check time threshold once every 16 iterations to
+ * avoid calling sched_clock() too frequently so
+ * as to reduce the average latency between the times
+ * when the lock becomes free and when the spinner
+ * is ready to do a trylock.
+ */
+ else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) {
+ rwsem_set_nonspinnable(sem);
+ lockevent_inc(rwsem_opt_nospin);
+ break;
+ }
+ }
+
+ /*
+ * An RT task cannot do optimistic spinning if it cannot
+ * be sure the lock holder is running or live-lock may
+ * happen if the current task and the lock holder happen
+ * to run in the same CPU. However, aborting optimistic
+ * spinning while a NULL owner is detected may miss some
+ * opportunity where spinning can continue without causing
+ * problem.
+ *
+ * There are 2 possible cases where an RT task may be able
+ * to continue spinning.
+ *
+ * 1) The lock owner is in the process of releasing the
+ * lock, sem->owner is cleared but the lock has not
+ * been released yet.
+ * 2) The lock was free and owner cleared, but another
+ * task just comes in and acquire the lock before
+ * we try to get it. The new owner may be a spinnable
+ * writer.
+ *
+ * To take advantage of two scenarios listed agove, the RT
+ * task is made to retry one more time to see if it can
+ * acquire the lock or continue spinning on the new owning
+ * writer. Of course, if the time lag is long enough or the
+ * new owner is not a writer or spinnable, the RT task will
+ * quit spinning.
+ *
+ * If the owner is a writer, the need_resched() check is
+ * done inside rwsem_spin_on_owner(). If the owner is not
+ * a writer, need_resched() check needs to be done here.
+ */
+ if (owner_state != OWNER_WRITER) {
+ if (need_resched())
+ break;
+ if (rt_task(current) &&
+ (prev_owner_state != OWNER_WRITER))
+ break;
+ }
+ prev_owner_state = owner_state;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ cpu_relax();
+ }
+ osq_unlock(&sem->osq);
+done:
+ preempt_enable();
+ lockevent_cond_inc(rwsem_opt_fail, !taken);
+ return taken;
+}
+
+/*
+ * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should
+ * only be called when the reader count reaches 0.
+ *
+ * This give writers better chance to acquire the rwsem first before
+ * readers when the rwsem was being held by readers for a relatively long
+ * period of time. Race can happen that an optimistic spinner may have
+ * just stolen the rwsem and set the owner, but just clearing the
+ * RWSEM_WR_NONSPINNABLE bit will do no harm anyway.
+ */
+static inline void clear_wr_nonspinnable(struct rw_semaphore *sem)
+{
+ if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE))
+ atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner);
+}
+
+/*
+ * This function is called when the reader fails to acquire the lock via
+ * optimistic spinning. In this case we will still attempt to do a trylock
+ * when comparing the rwsem state right now with the state when entering
+ * the slowpath indicates that the reader is still in a valid reader phase.
+ * This happens when the following conditions are true:
+ *
+ * 1) The lock is currently reader owned, and
+ * 2) The lock is previously not reader-owned or the last read owner changes.
+ *
+ * In the former case, we have transitioned from a writer phase to a
+ * reader-phase while spinning. In the latter case, it means the reader
+ * phase hasn't ended when we entered the optimistic spinning loop. In
+ * both cases, the reader is eligible to acquire the lock. This is the
+ * secondary path where a read lock is acquired optimistically.
+ *
+ * The reader non-spinnable bit wasn't set at time of entry or it will
+ * not be here at all.
+ */
+static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,
+ unsigned long last_rowner)
+{
+ unsigned long owner = atomic_long_read(&sem->owner);
+
+ if (!(owner & RWSEM_READER_OWNED))
+ return false;
+
+ if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) &&
+ rwsem_try_read_lock_unqueued(sem)) {
+ lockevent_inc(rwsem_opt_rlock2);
+ lockevent_add(rwsem_opt_fail, -1);
+ return true;
+ }
+ return false;
+}
+#else
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
+ unsigned long nonspinnable)
+{
+ return false;
+}
+
+static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock)
+{
+ return false;
+}
+
+static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { }
+
+static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem,
+ unsigned long last_rowner)
+{
+ return false;
+}
+#endif
+
+/*
+ * Wait for the read lock to be granted
+ */
+static struct rw_semaphore __sched *
+rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
+{
+ long count, adjustment = -RWSEM_READER_BIAS;
+ struct rwsem_waiter waiter;
+ DEFINE_WAKE_Q(wake_q);
+ bool wake = false;
+
+ /*
+ * Save the current read-owner of rwsem, if available, and the
+ * reader nonspinnable bit.
+ */
+ waiter.last_rowner = atomic_long_read(&sem->owner);
+ if (!(waiter.last_rowner & RWSEM_READER_OWNED))
+ waiter.last_rowner &= RWSEM_RD_NONSPINNABLE;
+
+ if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE))
+ goto queue;
+
+ /*
+ * Undo read bias from down_read() and do optimistic spinning.
+ */
+ atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
+ adjustment = 0;
+ if (rwsem_optimistic_spin(sem, false)) {
+ /*
+ * Wake up other readers in the wait list if the front
+ * waiter is a reader.
+ */
+ if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (!list_empty(&sem->wait_list))
+ rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED,
+ &wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+ }
+ return sem;
+ } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) {
+ return sem;
+ }
+
+queue:
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_READ;
+ waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (list_empty(&sem->wait_list)) {
+ /*
+ * In case the wait queue is empty and the lock isn't owned
+ * by a writer or has the handoff bit set, this reader can
+ * exit the slowpath and return immediately as its
+ * RWSEM_READER_BIAS has already been set in the count.
+ */
+ if (adjustment && !(atomic_long_read(&sem->count) &
+ (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+ raw_spin_unlock_irq(&sem->wait_lock);
+ rwsem_set_reader_owned(sem);
+ lockevent_inc(rwsem_rlock_fast);
+ return sem;
+ }
+ adjustment += RWSEM_FLAG_WAITERS;
+ }
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* we're now waiting on the lock, but no longer actively locking */
+ if (adjustment)
+ count = atomic_long_add_return(adjustment, &sem->count);
+ else
+ count = atomic_long_read(&sem->count);
+
+ /*
+ * If there are no active locks, wake the front queued process(es).
+ *
+ * If there are no writers and we are first in the queue,
+ * wake our own waiter to join the existing active readers !
+ */
+ if (!(count & RWSEM_LOCK_MASK)) {
+ clear_wr_nonspinnable(sem);
+ wake = true;
+ }
+ if (wake || (!(count & RWSEM_WRITER_MASK) &&
+ (adjustment & RWSEM_FLAG_WAITERS)))
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+
+ /* wait to be given the lock */
+ while (true) {
+ set_current_state(state);
+ if (!waiter.task)
+ break;
+ if (signal_pending_state(state, current)) {
+ raw_spin_lock_irq(&sem->wait_lock);
+ if (waiter.task)
+ goto out_nolock;
+ raw_spin_unlock_irq(&sem->wait_lock);
+ break;
+ }
+ schedule();
+ lockevent_inc(rwsem_sleep_reader);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ lockevent_inc(rwsem_rlock);
+ return sem;
+out_nolock:
+ list_del(&waiter.list);
+ if (list_empty(&sem->wait_list)) {
+ atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF,
+ &sem->count);
+ }
+ raw_spin_unlock_irq(&sem->wait_lock);
+ __set_current_state(TASK_RUNNING);
+ lockevent_inc(rwsem_rlock_fail);
+ return ERR_PTR(-EINTR);
+}
+
+/*
+ * This function is called by the a write lock owner. So the owner value
+ * won't get changed by others.
+ */
+static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem,
+ bool disable)
+{
+ if (unlikely(disable)) {
+ atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner);
+ lockevent_inc(rwsem_opt_norspin);
+ }
+}
+
+/*
+ * Wait until we successfully acquire the write lock
+ */
+static struct rw_semaphore *
+rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
+{
+ long count;
+ bool disable_rspin;
+ enum writer_wait_state wstate;
+ struct rwsem_waiter waiter;
+ struct rw_semaphore *ret = sem;
+ DEFINE_WAKE_Q(wake_q);
+
+ /* do optimistic spinning and steal lock if possible */
+ if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) &&
+ rwsem_optimistic_spin(sem, true))
+ return sem;
+
+ /*
+ * Disable reader optimistic spinning for this rwsem after
+ * acquiring the write lock when the setting of the nonspinnable
+ * bits are observed.
+ */
+ disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE;
+
+ /*
+ * Optimistic spinning failed, proceed to the slowpath
+ * and block until we can acquire the sem.
+ */
+ waiter.task = current;
+ waiter.type = RWSEM_WAITING_FOR_WRITE;
+ waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT;
+
+ raw_spin_lock_irq(&sem->wait_lock);
+
+ /* account for this before adding a new element to the list */
+ wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST;
+
+ list_add_tail(&waiter.list, &sem->wait_list);
+
+ /* we're now waiting on the lock */
+ if (wstate == WRITER_NOT_FIRST) {
+ count = atomic_long_read(&sem->count);
+
+ /*
+ * If there were already threads queued before us and:
+ * 1) there are no no active locks, wake the front
+ * queued process(es) as the handoff bit might be set.
+ * 2) there are no active writers and some readers, the lock
+ * must be read owned; so we try to wake any read lock
+ * waiters that were queued ahead of us.
+ */
+ if (count & RWSEM_WRITER_MASK)
+ goto wait;
+
+ rwsem_mark_wake(sem, (count & RWSEM_READER_MASK)
+ ? RWSEM_WAKE_READERS
+ : RWSEM_WAKE_ANY, &wake_q);
+
+ if (!wake_q_empty(&wake_q)) {
+ /*
+ * We want to minimize wait_lock hold time especially
+ * when a large number of readers are to be woken up.
+ */
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+ wake_q_init(&wake_q); /* Used again, reinit */
+ raw_spin_lock_irq(&sem->wait_lock);
+ }
+ } else {
+ atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count);
+ }
+
+wait:
+ /* wait until we successfully acquire the lock */
+ set_current_state(state);
+ while (true) {
+ if (rwsem_try_write_lock(sem, wstate))
+ break;
+
+ raw_spin_unlock_irq(&sem->wait_lock);
+
+ /* Block until there are no active lockers. */
+ for (;;) {
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+
+ schedule();
+ lockevent_inc(rwsem_sleep_writer);
+ set_current_state(state);
+ /*
+ * If HANDOFF bit is set, unconditionally do
+ * a trylock.
+ */
+ if (wstate == WRITER_HANDOFF)
+ break;
+
+ if ((wstate == WRITER_NOT_FIRST) &&
+ (rwsem_first_waiter(sem) == &waiter))
+ wstate = WRITER_FIRST;
+
+ count = atomic_long_read(&sem->count);
+ if (!(count & RWSEM_LOCK_MASK))
+ break;
+
+ /*
+ * The setting of the handoff bit is deferred
+ * until rwsem_try_write_lock() is called.
+ */
+ if ((wstate == WRITER_FIRST) && (rt_task(current) ||
+ time_after(jiffies, waiter.timeout))) {
+ wstate = WRITER_HANDOFF;
+ lockevent_inc(rwsem_wlock_handoff);
+ break;
+ }
+ }
+
+ raw_spin_lock_irq(&sem->wait_lock);
+ }
+ __set_current_state(TASK_RUNNING);
+ list_del(&waiter.list);
+ rwsem_disable_reader_optspin(sem, disable_rspin);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ lockevent_inc(rwsem_wlock);
+
+ return ret;
+
+out_nolock:
+ __set_current_state(TASK_RUNNING);
+ raw_spin_lock_irq(&sem->wait_lock);
+ list_del(&waiter.list);
+
+ if (unlikely(wstate == WRITER_HANDOFF))
+ atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count);
+
+ if (list_empty(&sem->wait_list))
+ atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count);
+ else
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+ lockevent_inc(rwsem_wlock_fail);
+
+ return ERR_PTR(-EINTR);
+}
+
+/*
+ * handle waking up a waiter on the semaphore
+ * - up_read/up_write has decremented the active part of count if we come here
+ */
+static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count)
+{
+ unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (!list_empty(&sem->wait_list))
+ rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
+
+ return sem;
+}
+
+/*
+ * downgrade a write lock into a read lock
+ * - caller incremented waiting part of count and discovered it still negative
+ * - just wake up any readers at the front of the queue
+ */
+static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
+{
+ unsigned long flags;
+ DEFINE_WAKE_Q(wake_q);
+
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+
+ if (!list_empty(&sem->wait_list))
+ rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
+
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
+
+ return sem;
+}
+
+/*
+ * lock for reading
+ */
+inline void __down_read(struct rw_semaphore *sem)
+{
+ if (!rwsem_read_trylock(sem)) {
+ rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE);
+ DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ } else {
+ rwsem_set_reader_owned(sem);
+ }
+}
+
+static inline int __down_read_killable(struct rw_semaphore *sem)
+{
+ if (!rwsem_read_trylock(sem)) {
+ if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE)))
+ return -EINTR;
+ DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ } else {
+ rwsem_set_reader_owned(sem);
+ }
+ return 0;
+}
+
+static inline int __down_read_trylock(struct rw_semaphore *sem)
+{
+ /*
+ * Optimize for the case when the rwsem is not locked at all.
+ */
+ long tmp = RWSEM_UNLOCKED_VALUE;
+
+ do {
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
+ tmp + RWSEM_READER_BIAS)) {
+ rwsem_set_reader_owned(sem);
+ return 1;
+ }
+ } while (!(tmp & RWSEM_READ_FAILED_MASK));
+ return 0;
+}
+
+/*
+ * lock for writing
+ */
+static inline void __down_write(struct rw_semaphore *sem)
+{
+ long tmp = RWSEM_UNLOCKED_VALUE;
+
+ if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
+ RWSEM_WRITER_LOCKED)))
+ rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE);
+ else
+ rwsem_set_owner(sem);
+}
+
+static inline int __down_write_killable(struct rw_semaphore *sem)
+{
+ long tmp = RWSEM_UNLOCKED_VALUE;
+
+ if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
+ RWSEM_WRITER_LOCKED))) {
+ if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE)))
+ return -EINTR;
+ } else {
+ rwsem_set_owner(sem);
+ }
+ return 0;
+}
+
+static inline int __down_write_trylock(struct rw_semaphore *sem)
+{
+ long tmp = RWSEM_UNLOCKED_VALUE;
+
+ if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
+ RWSEM_WRITER_LOCKED)) {
+ rwsem_set_owner(sem);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * unlock after reading
+ */
+inline void __up_read(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
+ rwsem_clear_reader_owned(sem);
+ tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count);
+ DEBUG_RWSEMS_WARN_ON(tmp < 0, sem);
+ if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) ==
+ RWSEM_FLAG_WAITERS)) {
+ clear_wr_nonspinnable(sem);
+ rwsem_wake(sem, tmp);
+ }
+}
+
+/*
+ * unlock after writing
+ */
+static inline void __up_write(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ /*
+ * sem->owner may differ from current if the ownership is transferred
+ * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits.
+ */
+ DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) &&
+ !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem);
+ rwsem_clear_owner(sem);
+ tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count);
+ if (unlikely(tmp & RWSEM_FLAG_WAITERS))
+ rwsem_wake(sem, tmp);
+}
+
+/*
+ * downgrade write lock to read lock
+ */
+static inline void __downgrade_write(struct rw_semaphore *sem)
+{
+ long tmp;
+
+ /*
+ * When downgrading from exclusive to shared ownership,
+ * anything inside the write-locked region cannot leak
+ * into the read side. In contrast, anything in the
+ * read-locked region is ok to be re-ordered into the
+ * write side. As such, rely on RELEASE semantics.
+ */
+ DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem);
+ tmp = atomic_long_fetch_add_release(
+ -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count);
+ rwsem_set_reader_owned(sem);
+ if (tmp & RWSEM_FLAG_WAITERS)
+ rwsem_downgrade_wake(sem);
+}
/*
* lock for reading
@@ -25,7 +1446,6 @@ void __sched down_read(struct rw_semaphore *sem)
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
}
-
EXPORT_SYMBOL(down_read);
int __sched down_read_killable(struct rw_semaphore *sem)
@@ -40,7 +1460,6 @@ int __sched down_read_killable(struct rw_semaphore *sem)
return 0;
}
-
EXPORT_SYMBOL(down_read_killable);
/*
@@ -54,7 +1473,6 @@ int down_read_trylock(struct rw_semaphore *sem)
rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
return ret;
}
-
EXPORT_SYMBOL(down_read_trylock);
/*
@@ -64,10 +1482,8 @@ void __sched down_write(struct rw_semaphore *sem)
{
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
}
-
EXPORT_SYMBOL(down_write);
/*
@@ -78,14 +1494,14 @@ int __sched down_write_killable(struct rw_semaphore *sem)
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
- if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
+ if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
+ __down_write_killable)) {
rwsem_release(&sem->dep_map, 1, _RET_IP_);
return -EINTR;
}
return 0;
}
-
EXPORT_SYMBOL(down_write_killable);
/*
@@ -100,7 +1516,6 @@ int down_write_trylock(struct rw_semaphore *sem)
return ret;
}
-
EXPORT_SYMBOL(down_write_trylock);
/*
@@ -109,10 +1524,8 @@ EXPORT_SYMBOL(down_write_trylock);
void up_read(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);
-
__up_read(sem);
}
-
EXPORT_SYMBOL(up_read);
/*
@@ -121,10 +1534,8 @@ EXPORT_SYMBOL(up_read);
void up_write(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);
-
__up_write(sem);
}
-
EXPORT_SYMBOL(up_write);
/*
@@ -133,10 +1544,8 @@ EXPORT_SYMBOL(up_write);
void downgrade_write(struct rw_semaphore *sem)
{
lock_downgrade(&sem->dep_map, _RET_IP_);
-
__downgrade_write(sem);
}
-
EXPORT_SYMBOL(downgrade_write);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -145,40 +1554,32 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
}
-
EXPORT_SYMBOL(down_read_nested);
void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
{
might_sleep();
rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
}
-
EXPORT_SYMBOL(_down_write_nest_lock);
void down_read_non_owner(struct rw_semaphore *sem)
{
might_sleep();
-
__down_read(sem);
__rwsem_set_reader_owned(sem, NULL);
}
-
EXPORT_SYMBOL(down_read_non_owner);
void down_write_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
-
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
}
-
EXPORT_SYMBOL(down_write_nested);
int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
@@ -186,23 +1587,21 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
might_sleep();
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
- if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
+ if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock,
+ __down_write_killable)) {
rwsem_release(&sem->dep_map, 1, _RET_IP_);
return -EINTR;
}
return 0;
}
-
EXPORT_SYMBOL(down_write_killable_nested);
void up_read_non_owner(struct rw_semaphore *sem)
{
- DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
- sem);
+ DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem);
__up_read(sem);
}
-
EXPORT_SYMBOL(up_read_non_owner);
#endif
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 64877f5294e3..2534ce49f648 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,304 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * The least significant 2 bits of the owner value has the following
- * meanings when set.
- * - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers
- * - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned,
- * i.e. the owner(s) cannot be readily determined. It can be reader
- * owned or the owning writer is indeterminate.
- *
- * When a writer acquires a rwsem, it puts its task_struct pointer
- * into the owner field. It is cleared after an unlock.
- *
- * When a reader acquires a rwsem, it will also puts its task_struct
- * pointer into the owner field with both the RWSEM_READER_OWNED and
- * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will
- * largely be left untouched. So for a free or reader-owned rwsem,
- * the owner value may contain information about the last reader that
- * acquires the rwsem. The anonymous bit is set because that particular
- * reader may or may not still own the lock.
- *
- * That information may be helpful in debugging cases where the system
- * seems to hang on a reader owned rwsem especially if only one reader
- * is involved. Ideally we would like to track all the readers that own
- * a rwsem, but the overhead is simply too big.
- */
-#include "lock_events.h"
-#define RWSEM_READER_OWNED (1UL << 0)
-#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1)
+#ifndef __INTERNAL_RWSEM_H
+#define __INTERNAL_RWSEM_H
+#include <linux/rwsem.h>
-#ifdef CONFIG_DEBUG_RWSEMS
-# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
- if (!debug_locks_silent && \
- WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
- #c, atomic_long_read(&(sem)->count), \
- (long)((sem)->owner), (long)current, \
- list_empty(&(sem)->wait_list) ? "" : "not ")) \
- debug_locks_off(); \
- } while (0)
-#else
-# define DEBUG_RWSEMS_WARN_ON(c, sem)
-#endif
+extern void __down_read(struct rw_semaphore *sem);
+extern void __up_read(struct rw_semaphore *sem);
-/*
- * R/W semaphores originally for PPC using the stuff in lib/rwsem.c.
- * Adapted largely from include/asm-i386/rwsem.h
- * by Paul Mackerras <paulus@samba.org>.
- */
-
-/*
- * the semaphore definition
- */
-#ifdef CONFIG_64BIT
-# define RWSEM_ACTIVE_MASK 0xffffffffL
-#else
-# define RWSEM_ACTIVE_MASK 0x0000ffffL
-#endif
-
-#define RWSEM_ACTIVE_BIAS 0x00000001L
-#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
-#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
-#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-/*
- * All writes to owner are protected by WRITE_ONCE() to make sure that
- * store tearing can't happen as optimistic spinners may read and use
- * the owner value concurrently without lock. Read from owner, however,
- * may not need READ_ONCE() as long as the pointer value is only used
- * for comparison and isn't being dereferenced.
- */
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
- WRITE_ONCE(sem->owner, current);
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
- WRITE_ONCE(sem->owner, NULL);
-}
-
-/*
- * The task_struct pointer of the last owning reader will be left in
- * the owner field.
- *
- * Note that the owner value just indicates the task has owned the rwsem
- * previously, it may not be the real owner or one of the real owners
- * anymore when that field is examined, so take it with a grain of salt.
- */
-static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
- struct task_struct *owner)
-{
- unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED
- | RWSEM_ANONYMOUSLY_OWNED;
-
- WRITE_ONCE(sem->owner, (struct task_struct *)val);
-}
-
-static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
-{
- __rwsem_set_reader_owned(sem, current);
-}
-
-/*
- * Return true if the a rwsem waiter can spin on the rwsem's owner
- * and steal the lock, i.e. the lock is not anonymously owned.
- * N.B. !owner is considered spinnable.
- */
-static inline bool is_rwsem_owner_spinnable(struct task_struct *owner)
-{
- return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED);
-}
-
-/*
- * Return true if rwsem is owned by an anonymous writer or readers.
- */
-static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
-{
- return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
-}
-
-#ifdef CONFIG_DEBUG_RWSEMS
-/*
- * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
- * is a task pointer in owner of a reader-owned rwsem, it will be the
- * real owner or one of the real owners. The only exception is when the
- * unlock is done by up_read_non_owner().
- */
-#define rwsem_clear_reader_owned rwsem_clear_reader_owned
-static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
-{
- unsigned long val = (unsigned long)current | RWSEM_READER_OWNED
- | RWSEM_ANONYMOUSLY_OWNED;
- if (READ_ONCE(sem->owner) == (struct task_struct *)val)
- cmpxchg_relaxed((unsigned long *)&sem->owner, val,
- RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED);
-}
-#endif
-
-#else
-static inline void rwsem_set_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void rwsem_clear_owner(struct rw_semaphore *sem)
-{
-}
-
-static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
- struct task_struct *owner)
-{
-}
-
-static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
-{
-}
-#endif
-
-#ifndef rwsem_clear_reader_owned
-static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
-{
-}
-#endif
-
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
-
-/*
- * lock for reading
- */
-static inline void __down_read(struct rw_semaphore *sem)
-{
- if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
- rwsem_down_read_failed(sem);
- DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
- RWSEM_READER_OWNED), sem);
- } else {
- rwsem_set_reader_owned(sem);
- }
-}
-
-static inline int __down_read_killable(struct rw_semaphore *sem)
-{
- if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
- if (IS_ERR(rwsem_down_read_failed_killable(sem)))
- return -EINTR;
- DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
- RWSEM_READER_OWNED), sem);
- } else {
- rwsem_set_reader_owned(sem);
- }
- return 0;
-}
-
-static inline int __down_read_trylock(struct rw_semaphore *sem)
-{
- /*
- * Optimize for the case when the rwsem is not locked at all.
- */
- long tmp = RWSEM_UNLOCKED_VALUE;
-
- lockevent_inc(rwsem_rtrylock);
- do {
- if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
- tmp + RWSEM_ACTIVE_READ_BIAS)) {
- rwsem_set_reader_owned(sem);
- return 1;
- }
- } while (tmp >= 0);
- return 0;
-}
-
-/*
- * lock for writing
- */
-static inline void __down_write(struct rw_semaphore *sem)
-{
- long tmp;
-
- tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
- &sem->count);
- if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
- rwsem_down_write_failed(sem);
- rwsem_set_owner(sem);
-}
-
-static inline int __down_write_killable(struct rw_semaphore *sem)
-{
- long tmp;
-
- tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
- &sem->count);
- if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
- if (IS_ERR(rwsem_down_write_failed_killable(sem)))
- return -EINTR;
- rwsem_set_owner(sem);
- return 0;
-}
-
-static inline int __down_write_trylock(struct rw_semaphore *sem)
-{
- long tmp;
-
- lockevent_inc(rwsem_wtrylock);
- tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE,
- RWSEM_ACTIVE_WRITE_BIAS);
- if (tmp == RWSEM_UNLOCKED_VALUE) {
- rwsem_set_owner(sem);
- return true;
- }
- return false;
-}
-
-/*
- * unlock after reading
- */
-static inline void __up_read(struct rw_semaphore *sem)
-{
- long tmp;
-
- DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
- sem);
- rwsem_clear_reader_owned(sem);
- tmp = atomic_long_dec_return_release(&sem->count);
- if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0))
- rwsem_wake(sem);
-}
-
-/*
- * unlock after writing
- */
-static inline void __up_write(struct rw_semaphore *sem)
-{
- DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
- rwsem_clear_owner(sem);
- if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS,
- &sem->count) < 0))
- rwsem_wake(sem);
-}
-
-/*
- * downgrade write lock to read lock
- */
-static inline void __downgrade_write(struct rw_semaphore *sem)
-{
- long tmp;
-
- /*
- * When downgrading from exclusive to shared ownership,
- * anything inside the write-locked region cannot leak
- * into the read side. In contrast, anything in the
- * read-locked region is ok to be re-ordered into the
- * write side. As such, rely on RELEASE semantics.
- */
- DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
- tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count);
- rwsem_set_reader_owned(sem);
- if (tmp < 0)
- rwsem_downgrade_wake(sem);
-}
+#endif /* __INTERNAL_RWSEM_H */
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 561acdd39960..d9dd94defc0a 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -1,9 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008 Intel Corporation
* Author: Matthew Wilcox <willy@linux.intel.com>
*
- * Distributed under the terms of the GNU GPL, version 2
- *
* This file implements counting semaphores.
* A counting semaphore may be acquired 'n' times before sleeping.
* See mutex.c for single-acquisition sleeping locks which enforce
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 65a3b7e55b9f..3e82f449b4ff 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Module-based API test facility for ww_mutexes
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
*/
#include <linux/kernel.h>
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 1490e63f69a9..6e1970719dc2 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -95,6 +95,7 @@ static void devm_memremap_pages_release(void *data)
pgmap->kill(pgmap->ref);
for_each_device_pfn(pfn, pgmap)
put_page(pfn_to_page(pfn));
+ pgmap->cleanup(pgmap->ref);
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
@@ -133,8 +134,8 @@ static void devm_memremap_pages_release(void *data)
* 2/ The altmap field may optionally be initialized, in which case altmap_valid
* must be set to true
*
- * 3/ pgmap->ref must be 'live' on entry and will be killed at
- * devm_memremap_pages_release() time, or if this routine fails.
+ * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
+ * at devm_memremap_pages_release() time, or if this routine fails.
*
* 4/ res is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -156,8 +157,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
pgprot_t pgprot = PAGE_KERNEL;
int error, nid, is_ram;
- if (!pgmap->ref || !pgmap->kill)
+ if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
+ WARN(1, "Missing reference count teardown definition\n");
return ERR_PTR(-EINVAL);
+ }
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -168,14 +171,16 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
if (conflict_pgmap) {
dev_WARN(dev, "Conflicting mapping in same section\n");
put_dev_pagemap(conflict_pgmap);
- return ERR_PTR(-ENOMEM);
+ error = -ENOMEM;
+ goto err_array;
}
conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL);
if (conflict_pgmap) {
dev_WARN(dev, "Conflicting mapping in same section\n");
put_dev_pagemap(conflict_pgmap);
- return ERR_PTR(-ENOMEM);
+ error = -ENOMEM;
+ goto err_array;
}
is_ram = region_intersects(align_start, align_size,
@@ -267,10 +272,18 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
pgmap_array_delete(res);
err_array:
pgmap->kill(pgmap->ref);
+ pgmap->cleanup(pgmap->ref);
+
return ERR_PTR(error);
}
EXPORT_SYMBOL_GPL(devm_memremap_pages);
+void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
+{
+ devm_release_action(dev, devm_memremap_pages_release, pgmap);
+}
+EXPORT_SYMBOL_GPL(devm_memunmap_pages);
+
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
/* number of pfns from base where pfn_to_page() is valid */
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
index d354341f8cc0..33783abc377b 100644
--- a/kernel/module-internal.h
+++ b/kernel/module-internal.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Module internals
*
* Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
*/
#include <linux/elf.h>
diff --git a/kernel/module.c b/kernel/module.c
index 6e6712b3aaf5..a2cee14a83f3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
Copyright (C) 2002 Richard Henderson
Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/export.h>
#include <linux/extable.h>
@@ -3095,6 +3083,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->tracepoints_ptrs),
&mod->num_tracepoints);
#endif
+#ifdef CONFIG_TREE_SRCU
+ mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs",
+ sizeof(*mod->srcu_struct_ptrs),
+ &mod->num_srcu_structs);
+#endif
#ifdef CONFIG_BPF_EVENTS
mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map",
sizeof(*mod->bpf_raw_events),
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 6b9a926fd86b..b10fb1986ca9 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Module signature checker
*
* Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
*/
#include <linux/kernel.h>
diff --git a/kernel/notifier.c b/kernel/notifier.c
index bfc95b3e4235..d9f5081d578d 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/export.h>
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f6c5d330059a..c815f58e6bc0 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2006 IBM Corporation
*
* Author: Serge Hallyn <serue@us.ibm.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2 of the
- * License.
- *
* Jun 2006 - namespaces support
* OpenVZ, SWsoft Inc.
* Pavel Emelianov <xemul@openvz.org>
diff --git a/kernel/panic.c b/kernel/panic.c
index b4543a31a495..4d9f55bf7d38 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/panic.c
*
diff --git a/kernel/params.c b/kernel/params.c
index ce89f757e6da..cf448785d058 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Helpers for initial module or kernel cmdline parsing
Copyright (C) 2001 Rusty Russell.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kernel.h>
#include <linux/string.h>
diff --git a/kernel/pid.c b/kernel/pid.c
index 89548d35eefb..e5cad0c7d5dd 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic pidhash and scalable, time-bounded PID allocator
*
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index aa6e72fb7c08..f54bc7cb6c2d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Pid namespaces
*
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9bbaaab14b36..ff8592ddedee 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config SUSPEND
bool "Suspend to RAM and standby"
depends on ARCH_SUSPEND_POSSIBLE
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 7d66ee68aaaf..0a9326f5f421 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -223,7 +223,7 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
* All CPUs of a domain must have the same micro-architecture
* since they all share the same table.
*/
- cap = arch_scale_cpu_capacity(NULL, cpu);
+ cap = arch_scale_cpu_capacity(cpu);
if (prev_cap && prev_cap != cap) {
pr_err("CPUs of %*pbl must have the same capacity\n",
cpumask_pr_args(span));
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index c8c272df7154..cd7434e6000d 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.
*
@@ -6,8 +7,6 @@
* Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
* Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
* Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com>
- *
- * This file is released under the GPLv2.
*/
#define pr_fmt(fmt) "PM: " fmt
@@ -129,7 +128,7 @@ static int hibernation_test(int level) { return 0; }
static int platform_begin(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
- hibernation_ops->begin() : 0;
+ hibernation_ops->begin(PMSG_FREEZE) : 0;
}
/**
@@ -257,6 +256,11 @@ void swsusp_show_speed(ktime_t start, ktime_t stop,
(kps % 1000) / 10);
}
+__weak int arch_resume_nosmt(void)
+{
+ return 0;
+}
+
/**
* create_image - Create a hibernation image.
* @platform_mode: Whether or not to use the platform driver.
@@ -324,6 +328,10 @@ static int create_image(int platform_mode)
Enable_cpus:
suspend_enable_secondary_cpus();
+ /* Allow architectures to do nosmt-specific post-resume dances */
+ if (!in_suspend)
+ error = arch_resume_nosmt();
+
Platform_finish:
platform_finish(platform_mode);
@@ -542,7 +550,7 @@ int hibernation_platform_enter(void)
* hibernation_ops->finish() before saving the image, so we should let
* the firmware know that we're going to enter the sleep state after all
*/
- error = hibernation_ops->begin();
+ error = hibernation_ops->begin(PMSG_HIBERNATE);
if (error)
goto Close;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 4f43e724f6eb..bdbd605c4215 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/power/main.c - PM subsystem core functionality.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
- *
- * This file is released under the GPLv2
- *
*/
#include <linux/export.h>
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 7ef6866b521d..6d475281c730 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -1,7 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* poweroff.c - sysrq handler to gracefully power down machine.
- *
- * This file is released under the GPL v2
*/
#include <linux/kernel.h>
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 9d22131afc1e..33e3febaba53 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This module exposes the interface to kernel space for specifying
* QoS dependencies. It provides infrastructure for registration of:
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index bc9558ab1e5b..83105874f255 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/power/snapshot.c
*
@@ -5,9 +6,6 @@
*
* Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- *
- * This file is released under the GPLv2.
- *
*/
#define pr_fmt(fmt) "PM: " fmt
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ef908c134b34..096211299c07 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/power/suspend.c - Suspend to RAM and standby functionality.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
* Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
- *
- * This file is released under the GPLv2.
*/
#define pr_fmt(fmt) "PM: " fmt
@@ -62,6 +61,12 @@ static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head);
enum s2idle_states __read_mostly s2idle_state;
static DEFINE_RAW_SPINLOCK(s2idle_lock);
+/**
+ * pm_suspend_via_s2idle - Check if suspend-to-idle is the default suspend.
+ *
+ * Return 'true' if suspend-to-idle has been selected as the default system
+ * suspend method.
+ */
bool pm_suspend_via_s2idle(void)
{
return mem_sleep_current == PM_SUSPEND_TO_IDLE;
@@ -488,6 +493,9 @@ int suspend_devices_and_enter(suspend_state_t state)
pm_suspend_target_state = state;
+ if (state == PM_SUSPEND_TO_IDLE)
+ pm_set_suspend_no_platform();
+
error = platform_suspend_begin(state);
if (error)
goto Close;
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 6a897e8b2a88..60564b58de07 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -1,9 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
*
* Copyright (c) 2009 Pavel Machek <pavel@ucw.cz>
- *
- * This file is released under the GPLv2.
*/
#include <linux/init.h>
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index d7f6c1a288d3..e1912ad13bdc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/power/swap.c
*
@@ -7,9 +8,6 @@
* Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
* Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com>
- *
- * This file is released under the GPLv2.
- *
*/
#define pr_fmt(fmt) "PM: " fmt
diff --git a/kernel/power/user.c b/kernel/power/user.c
index cb24e840a3e6..77438954cc2b 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/power/user.c
*
* This file provides the user space interface for software suspend/resume.
*
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- *
- * This file is released under the GPLv2.
- *
*/
#include <linux/suspend.h>
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 4a2ffc39eb95..4d052fc6bcde 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-y = printk.o
obj-$(CONFIG_PRINTK) += printk_safe.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 0f1898820cba..c8e6ab689d42 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -1,18 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* internal.h - printk internal definitions
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/percpu.h>
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a6e06fe38e41..1888f6a3b694 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/printk.c
*
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 0913b4d385de..b4045e782743 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -1,18 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* printk_safe.c - Safe printk for printk-deadlock-prone contexts
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/preempt.h>
diff --git a/kernel/profile.c b/kernel/profile.c
index 9c08a2c7cb1d..af7c94bf5fa1 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/profile.c
* Simple profiling. Manages a direct-mapped profile hit count buffer,
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6f357f4fc859..83a531cea2f3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/ptrace.c
*
@@ -78,9 +79,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
*/
static void ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
- rcu_read_lock();
- __ptrace_link(child, new_parent, __task_cred(new_parent));
- rcu_read_unlock();
+ __ptrace_link(child, new_parent, current_cred());
}
/**
@@ -117,6 +116,9 @@ void __ptrace_unlink(struct task_struct *child)
BUG_ON(!child->ptrace);
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+#ifdef TIF_SYSCALL_EMU
+ clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+#endif
child->parent = child->real_parent;
list_del_init(&child->ptrace_entry);
@@ -323,6 +325,16 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
return -EPERM;
ok:
rcu_read_unlock();
+ /*
+ * If a task drops privileges and becomes nondumpable (through a syscall
+ * like setresuid()) while we are trying to access it, we must ensure
+ * that the dumpability is read after the credentials; otherwise,
+ * we may be able to attach to a task that we shouldn't be able to
+ * attach to (as if the task had dropped privileges without becoming
+ * nondumpable).
+ * Pairs with a write barrier in commit_creds().
+ */
+ smp_rmb();
mm = task->mm;
if (mm &&
((get_dumpable(mm) != SUID_DUMP_USER) &&
@@ -704,6 +716,10 @@ static int ptrace_peek_siginfo(struct task_struct *child,
if (arg.nr < 0)
return -EINVAL;
+ /* Ensure arg.off fits in an unsigned long */
+ if (arg.off > ULONG_MAX)
+ return 0;
+
if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
pending = &child->signal->shared_pending;
else
@@ -711,18 +727,20 @@ static int ptrace_peek_siginfo(struct task_struct *child,
for (i = 0; i < arg.nr; ) {
kernel_siginfo_t info;
- s32 off = arg.off + i;
+ unsigned long off = arg.off + i;
+ bool found = false;
spin_lock_irq(&child->sighand->siglock);
list_for_each_entry(q, &pending->list, list) {
if (!off--) {
+ found = true;
copy_siginfo(&info, &q->info);
break;
}
}
spin_unlock_irq(&child->sighand->siglock);
- if (off >= 0) /* beyond the end of the list */
+ if (!found) /* beyond the end of the list */
break;
#ifdef CONFIG_COMPAT
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 37301430970e..480edf328b51 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# RCU-related configuration options
#
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 0ec7d1d33a14..5ec3ea4028e2 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# RCU-related debugging configuration options
#
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 390aab20115e..5290b01de534 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -446,6 +446,7 @@ void rcu_request_urgent_qs_task(struct task_struct *t);
enum rcutorture_type {
RCU_FLAVOR,
RCU_TASKS_FLAVOR,
+ RCU_TRIVIAL_FLAVOR,
SRCU_FLAVOR,
INVALID_RCU_FLAVOR
};
@@ -479,6 +480,10 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
#endif
#endif
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
+long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
+#endif
+
#ifdef CONFIG_TINY_SRCU
static inline void srcutorture_get_gp_data(enum rcutorture_type test_type,
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index efaa5b3f4d3f..fce4e7e6f502 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -299,6 +299,7 @@ struct rcu_torture_ops {
int irq_capable;
int can_boost;
int extendables;
+ int slow_gps;
const char *name;
};
@@ -667,9 +668,51 @@ static struct rcu_torture_ops tasks_ops = {
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
+ .slow_gps = 1,
.name = "tasks"
};
+/*
+ * Definitions for trivial CONFIG_PREEMPT=n-only torture testing.
+ * This implementation does not necessarily work well with CPU hotplug.
+ */
+
+static void synchronize_rcu_trivial(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu));
+ WARN_ON_ONCE(raw_smp_processor_id() != cpu);
+ }
+}
+
+static int rcu_torture_read_lock_trivial(void) __acquires(RCU)
+{
+ preempt_disable();
+ return 0;
+}
+
+static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU)
+{
+ preempt_enable();
+}
+
+static struct rcu_torture_ops trivial_ops = {
+ .ttype = RCU_TRIVIAL_FLAVOR,
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock_trivial,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_torture_read_unlock_trivial,
+ .get_gp_seq = rcu_no_completed,
+ .sync = synchronize_rcu_trivial,
+ .exp_sync = synchronize_rcu_trivial,
+ .fqs = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "trivial"
+};
+
static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old)
{
if (!cur_ops->gp_diff)
@@ -1010,10 +1053,17 @@ rcu_torture_writer(void *arg)
!rcu_gp_is_normal();
}
rcu_torture_writer_state = RTWS_STUTTER;
- if (stutter_wait("rcu_torture_writer"))
+ if (stutter_wait("rcu_torture_writer") &&
+ !READ_ONCE(rcu_fwd_cb_nodelay) &&
+ !cur_ops->slow_gps &&
+ !torture_must_stop())
for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
- if (list_empty(&rcu_tortures[i].rtort_free))
- WARN_ON_ONCE(1);
+ if (list_empty(&rcu_tortures[i].rtort_free) &&
+ rcu_access_pointer(rcu_torture_current) !=
+ &rcu_tortures[i]) {
+ rcu_ftrace_dump(DUMP_ALL);
+ WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
+ }
} while (!torture_must_stop());
/* Reset expediting back to unexpedited. */
if (expediting > 0)
@@ -1358,8 +1408,9 @@ rcu_torture_stats_print(void)
}
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
- pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
+ pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
rcu_torture_current,
+ rcu_torture_current ? "ver" : "VER",
rcu_torture_current_version,
list_empty(&rcu_torture_freelist),
atomic_read(&n_rcu_torture_alloc),
@@ -1661,6 +1712,17 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
spin_unlock_irqrestore(&rcu_fwd_lock, flags);
}
+// Give the scheduler a chance, even on nohz_full CPUs.
+static void rcu_torture_fwd_prog_cond_resched(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ if (need_resched())
+ schedule();
+ } else {
+ cond_resched();
+ }
+}
+
/*
* Free all callbacks on the rcu_fwd_cb_head list, either because the
* test is over or because we hit an OOM event.
@@ -1674,16 +1736,18 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void)
for (;;) {
spin_lock_irqsave(&rcu_fwd_lock, flags);
rfcp = rcu_fwd_cb_head;
- if (!rfcp)
+ if (!rfcp) {
+ spin_unlock_irqrestore(&rcu_fwd_lock, flags);
break;
+ }
rcu_fwd_cb_head = rfcp->rfc_next;
if (!rcu_fwd_cb_head)
rcu_fwd_cb_tail = &rcu_fwd_cb_head;
spin_unlock_irqrestore(&rcu_fwd_lock, flags);
kfree(rfcp);
freed++;
+ rcu_torture_fwd_prog_cond_resched();
}
- spin_unlock_irqrestore(&rcu_fwd_lock, flags);
return freed;
}
@@ -1707,6 +1771,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
}
/* Tight loop containing cond_resched(). */
+ WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+ cur_ops->sync(); /* Later readers see above write. */
if (selfpropcb) {
WRITE_ONCE(fcs.stop, 0);
cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb);
@@ -1724,7 +1790,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
udelay(10);
cur_ops->readunlock(idx);
if (!fwd_progress_need_resched || need_resched())
- cond_resched();
+ rcu_torture_fwd_prog_cond_resched();
}
(*tested_tries)++;
if (!time_before(jiffies, stopat) &&
@@ -1745,6 +1811,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries)
WARN_ON(READ_ONCE(fcs.stop) != 2);
destroy_rcu_head_on_stack(&fcs.rh);
}
+ schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */
+ WRITE_ONCE(rcu_fwd_cb_nodelay, false);
}
/* Carry out call_rcu() forward-progress testing. */
@@ -1765,6 +1833,8 @@ static void rcu_torture_fwd_prog_cr(void)
if (READ_ONCE(rcu_fwd_emergency_stop))
return; /* Get out of the way quickly, no GP wait! */
+ if (!cur_ops->call)
+ return; /* Can't do call_rcu() fwd prog without ->call. */
/* Loop continuously posting RCU callbacks. */
WRITE_ONCE(rcu_fwd_cb_nodelay, true);
@@ -1805,7 +1875,7 @@ static void rcu_torture_fwd_prog_cr(void)
rfcp->rfc_gps = 0;
}
cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr);
- cond_resched();
+ rcu_torture_fwd_prog_cond_resched();
}
stoppedat = jiffies;
n_launders_cb_snap = READ_ONCE(n_launders_cb);
@@ -1814,7 +1884,6 @@ static void rcu_torture_fwd_prog_cr(void)
cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
(void)rcu_torture_fwd_prog_cbfree();
- WRITE_ONCE(rcu_fwd_cb_nodelay, false);
if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) {
WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED);
pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n",
@@ -1825,6 +1894,8 @@ static void rcu_torture_fwd_prog_cr(void)
n_max_gps, n_max_cbs, cver, gps);
rcu_torture_fwd_cb_hist();
}
+ schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
+ WRITE_ONCE(rcu_fwd_cb_nodelay, false);
}
@@ -2240,7 +2311,7 @@ rcu_torture_init(void)
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] = {
&rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
- &busted_srcud_ops, &tasks_ops,
+ &busted_srcud_ops, &tasks_ops, &trivial_ops,
};
if (!torture_init_begin(torture_type, verbose))
@@ -2363,7 +2434,10 @@ rcu_torture_init(void)
if (stutter < 0)
stutter = 0;
if (stutter) {
- firsterr = torture_stutter_init(stutter * HZ);
+ int t;
+
+ t = cur_ops->stall_dur ? cur_ops->stall_dur() : stutter * HZ;
+ firsterr = torture_stutter_init(stutter * HZ, t);
if (firsterr)
goto unwind;
}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 9b761e546de8..cf0e886314f2 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -831,8 +831,8 @@ static void srcu_leak_callback(struct rcu_head *rhp)
* srcu_read_lock(), and srcu_read_unlock() that are all passed the same
* srcu_struct structure.
*/
-void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
- rcu_callback_t func, bool do_norm)
+static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
+ rcu_callback_t func, bool do_norm)
{
unsigned long flags;
int idx;
@@ -1310,3 +1310,68 @@ void __init srcu_init(void)
queue_work(rcu_gp_wq, &ssp->work.work);
}
}
+
+#ifdef CONFIG_MODULES
+
+/* Initialize any global-scope srcu_struct structures used by this module. */
+static int srcu_module_coming(struct module *mod)
+{
+ int i;
+ struct srcu_struct **sspp = mod->srcu_struct_ptrs;
+ int ret;
+
+ for (i = 0; i < mod->num_srcu_structs; i++) {
+ ret = init_srcu_struct(*(sspp++));
+ if (WARN_ON_ONCE(ret))
+ return ret;
+ }
+ return 0;
+}
+
+/* Clean up any global-scope srcu_struct structures used by this module. */
+static void srcu_module_going(struct module *mod)
+{
+ int i;
+ struct srcu_struct **sspp = mod->srcu_struct_ptrs;
+
+ for (i = 0; i < mod->num_srcu_structs; i++)
+ cleanup_srcu_struct(*(sspp++));
+}
+
+/* Handle one module, either coming or going. */
+static int srcu_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+ int ret = 0;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ ret = srcu_module_coming(mod);
+ break;
+ case MODULE_STATE_GOING:
+ srcu_module_going(mod);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+static struct notifier_block srcu_module_nb = {
+ .notifier_call = srcu_module_notify,
+ .priority = 0,
+};
+
+static __init int init_srcu_module_notifier(void)
+{
+ int ret;
+
+ ret = register_module_notifier(&srcu_module_nb);
+ if (ret)
+ pr_warn("Failed to register srcu module notifier\n");
+ return ret;
+}
+late_initcall(init_srcu_module_notifier);
+
+#endif /* #ifdef CONFIG_MODULES */
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index a8304d90573f..d4558ab7a07d 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -10,65 +10,18 @@
#include <linux/rcu_sync.h>
#include <linux/sched.h>
-#ifdef CONFIG_PROVE_RCU
-#define __INIT_HELD(func) .held = func,
-#else
-#define __INIT_HELD(func)
-#endif
-
-static const struct {
- void (*sync)(void);
- void (*call)(struct rcu_head *, void (*)(struct rcu_head *));
- void (*wait)(void);
-#ifdef CONFIG_PROVE_RCU
- int (*held)(void);
-#endif
-} gp_ops[] = {
- [RCU_SYNC] = {
- .sync = synchronize_rcu,
- .call = call_rcu,
- .wait = rcu_barrier,
- __INIT_HELD(rcu_read_lock_held)
- },
- [RCU_SCHED_SYNC] = {
- .sync = synchronize_rcu,
- .call = call_rcu,
- .wait = rcu_barrier,
- __INIT_HELD(rcu_read_lock_sched_held)
- },
- [RCU_BH_SYNC] = {
- .sync = synchronize_rcu,
- .call = call_rcu,
- .wait = rcu_barrier,
- __INIT_HELD(rcu_read_lock_bh_held)
- },
-};
-
-enum { GP_IDLE = 0, GP_PENDING, GP_PASSED };
-enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY };
+enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY };
#define rss_lock gp_wait.lock
-#ifdef CONFIG_PROVE_RCU
-void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
-{
- RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
- "suspicious rcu_sync_is_idle() usage");
-}
-
-EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert);
-#endif
-
/**
* rcu_sync_init() - Initialize an rcu_sync structure
* @rsp: Pointer to rcu_sync structure to be initialized
- * @type: Flavor of RCU with which to synchronize rcu_sync structure
*/
-void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
+void rcu_sync_init(struct rcu_sync *rsp)
{
memset(rsp, 0, sizeof(*rsp));
init_waitqueue_head(&rsp->gp_wait);
- rsp->gp_type = type;
}
/**
@@ -86,56 +39,26 @@ void rcu_sync_enter_start(struct rcu_sync *rsp)
rsp->gp_state = GP_PASSED;
}
-/**
- * rcu_sync_enter() - Force readers onto slowpath
- * @rsp: Pointer to rcu_sync structure to use for synchronization
- *
- * This function is used by updaters who need readers to make use of
- * a slowpath during the update. After this function returns, all
- * subsequent calls to rcu_sync_is_idle() will return false, which
- * tells readers to stay off their fastpaths. A later call to
- * rcu_sync_exit() re-enables reader slowpaths.
- *
- * When called in isolation, rcu_sync_enter() must wait for a grace
- * period, however, closely spaced calls to rcu_sync_enter() can
- * optimize away the grace-period wait via a state machine implemented
- * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
- */
-void rcu_sync_enter(struct rcu_sync *rsp)
-{
- bool need_wait, need_sync;
- spin_lock_irq(&rsp->rss_lock);
- need_wait = rsp->gp_count++;
- need_sync = rsp->gp_state == GP_IDLE;
- if (need_sync)
- rsp->gp_state = GP_PENDING;
- spin_unlock_irq(&rsp->rss_lock);
+static void rcu_sync_func(struct rcu_head *rhp);
- WARN_ON_ONCE(need_wait && need_sync);
- if (need_sync) {
- gp_ops[rsp->gp_type].sync();
- rsp->gp_state = GP_PASSED;
- wake_up_all(&rsp->gp_wait);
- } else if (need_wait) {
- wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED);
- } else {
- /*
- * Possible when there's a pending CB from a rcu_sync_exit().
- * Nobody has yet been allowed the 'fast' path and thus we can
- * avoid doing any sync(). The callback will get 'dropped'.
- */
- WARN_ON_ONCE(rsp->gp_state != GP_PASSED);
- }
+static void rcu_sync_call(struct rcu_sync *rsp)
+{
+ call_rcu(&rsp->cb_head, rcu_sync_func);
}
/**
* rcu_sync_func() - Callback function managing reader access to fastpath
* @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization
*
- * This function is passed to one of the call_rcu() functions by
+ * This function is passed to call_rcu() function by rcu_sync_enter() and
* rcu_sync_exit(), so that it is invoked after a grace period following the
- * that invocation of rcu_sync_exit(). It takes action based on events that
+ * that invocation of enter/exit.
+ *
+ * If it is called by rcu_sync_enter() it signals that all the readers were
+ * switched onto slow path.
+ *
+ * If it is called by rcu_sync_exit() it takes action based on events that
* have taken place in the meantime, so that closely spaced rcu_sync_enter()
* and rcu_sync_exit() pairs need not wait for a grace period.
*
@@ -152,35 +75,88 @@ static void rcu_sync_func(struct rcu_head *rhp)
struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head);
unsigned long flags;
- WARN_ON_ONCE(rsp->gp_state != GP_PASSED);
- WARN_ON_ONCE(rsp->cb_state == CB_IDLE);
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
spin_lock_irqsave(&rsp->rss_lock, flags);
if (rsp->gp_count) {
/*
- * A new rcu_sync_begin() has happened; drop the callback.
+ * We're at least a GP after the GP_IDLE->GP_ENTER transition.
*/
- rsp->cb_state = CB_IDLE;
- } else if (rsp->cb_state == CB_REPLAY) {
+ WRITE_ONCE(rsp->gp_state, GP_PASSED);
+ wake_up_locked(&rsp->gp_wait);
+ } else if (rsp->gp_state == GP_REPLAY) {
/*
- * A new rcu_sync_exit() has happened; requeue the callback
- * to catch a later GP.
+ * A new rcu_sync_exit() has happened; requeue the callback to
+ * catch a later GP.
*/
- rsp->cb_state = CB_PENDING;
- gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
+ WRITE_ONCE(rsp->gp_state, GP_EXIT);
+ rcu_sync_call(rsp);
} else {
/*
- * We're at least a GP after rcu_sync_exit(); eveybody will now
- * have observed the write side critical section. Let 'em rip!.
+ * We're at least a GP after the last rcu_sync_exit(); eveybody
+ * will now have observed the write side critical section.
+ * Let 'em rip!.
*/
- rsp->cb_state = CB_IDLE;
- rsp->gp_state = GP_IDLE;
+ WRITE_ONCE(rsp->gp_state, GP_IDLE);
}
spin_unlock_irqrestore(&rsp->rss_lock, flags);
}
/**
- * rcu_sync_exit() - Allow readers back onto fast patch after grace period
+ * rcu_sync_enter() - Force readers onto slowpath
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
+ * This function is used by updaters who need readers to make use of
+ * a slowpath during the update. After this function returns, all
+ * subsequent calls to rcu_sync_is_idle() will return false, which
+ * tells readers to stay off their fastpaths. A later call to
+ * rcu_sync_exit() re-enables reader slowpaths.
+ *
+ * When called in isolation, rcu_sync_enter() must wait for a grace
+ * period, however, closely spaced calls to rcu_sync_enter() can
+ * optimize away the grace-period wait via a state machine implemented
+ * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func().
+ */
+void rcu_sync_enter(struct rcu_sync *rsp)
+{
+ int gp_state;
+
+ spin_lock_irq(&rsp->rss_lock);
+ gp_state = rsp->gp_state;
+ if (gp_state == GP_IDLE) {
+ WRITE_ONCE(rsp->gp_state, GP_ENTER);
+ WARN_ON_ONCE(rsp->gp_count);
+ /*
+ * Note that we could simply do rcu_sync_call(rsp) here and
+ * avoid the "if (gp_state == GP_IDLE)" block below.
+ *
+ * However, synchronize_rcu() can be faster if rcu_expedited
+ * or rcu_blocking_is_gp() is true.
+ *
+ * Another reason is that we can't wait for rcu callback if
+ * we are called at early boot time but this shouldn't happen.
+ */
+ }
+ rsp->gp_count++;
+ spin_unlock_irq(&rsp->rss_lock);
+
+ if (gp_state == GP_IDLE) {
+ /*
+ * See the comment above, this simply does the "synchronous"
+ * call_rcu(rcu_sync_func) which does GP_ENTER -> GP_PASSED.
+ */
+ synchronize_rcu();
+ rcu_sync_func(&rsp->cb_head);
+ /* Not really needed, wait_event() would see GP_PASSED. */
+ return;
+ }
+
+ wait_event(rsp->gp_wait, READ_ONCE(rsp->gp_state) >= GP_PASSED);
+}
+
+/**
+ * rcu_sync_exit() - Allow readers back onto fast path after grace period
* @rsp: Pointer to rcu_sync structure to use for synchronization
*
* This function is used by updaters who have completed, and can therefore
@@ -191,13 +167,16 @@ static void rcu_sync_func(struct rcu_head *rhp)
*/
void rcu_sync_exit(struct rcu_sync *rsp)
{
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0);
+
spin_lock_irq(&rsp->rss_lock);
if (!--rsp->gp_count) {
- if (rsp->cb_state == CB_IDLE) {
- rsp->cb_state = CB_PENDING;
- gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func);
- } else if (rsp->cb_state == CB_PENDING) {
- rsp->cb_state = CB_REPLAY;
+ if (rsp->gp_state == GP_PASSED) {
+ WRITE_ONCE(rsp->gp_state, GP_EXIT);
+ rcu_sync_call(rsp);
+ } else if (rsp->gp_state == GP_EXIT) {
+ WRITE_ONCE(rsp->gp_state, GP_REPLAY);
}
}
spin_unlock_irq(&rsp->rss_lock);
@@ -209,18 +188,19 @@ void rcu_sync_exit(struct rcu_sync *rsp)
*/
void rcu_sync_dtor(struct rcu_sync *rsp)
{
- int cb_state;
+ int gp_state;
- WARN_ON_ONCE(rsp->gp_count);
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_count));
+ WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
spin_lock_irq(&rsp->rss_lock);
- if (rsp->cb_state == CB_REPLAY)
- rsp->cb_state = CB_PENDING;
- cb_state = rsp->cb_state;
+ if (rsp->gp_state == GP_REPLAY)
+ WRITE_ONCE(rsp->gp_state, GP_EXIT);
+ gp_state = rsp->gp_state;
spin_unlock_irq(&rsp->rss_lock);
- if (cb_state != CB_IDLE) {
- gp_ops[rsp->gp_type].wait();
- WARN_ON_ONCE(rsp->cb_state != CB_IDLE);
+ if (gp_state != GP_IDLE) {
+ rcu_barrier();
+ WARN_ON_ONCE(rsp->gp_state != GP_IDLE);
}
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 980ca3ca643f..a14e5fbbea46 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -51,6 +51,12 @@
#include <linux/tick.h>
#include <linux/sysrq.h>
#include <linux/kprobes.h>
+#include <linux/gfp.h>
+#include <linux/oom.h>
+#include <linux/smpboot.h>
+#include <linux/jiffies.h>
+#include <linux/sched/isolation.h>
+#include "../time/tick-internal.h"
#include "tree.h"
#include "rcu.h"
@@ -92,6 +98,9 @@ struct rcu_state rcu_state = {
/* Dump rcu_node combining tree at boot to verify correct setup. */
static bool dump_tree;
module_param(dump_tree, bool, 0444);
+/* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */
+static bool use_softirq = 1;
+module_param(use_softirq, bool, 0444);
/* Control rcu_node-tree auto-balancing at boot time. */
static bool rcu_fanout_exact;
module_param(rcu_fanout_exact, bool, 0444);
@@ -138,7 +147,6 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
-static void invoke_rcu_callbacks(struct rcu_data *rdp);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
@@ -368,19 +376,33 @@ static void __maybe_unused rcu_momentary_dyntick_idle(void)
}
/**
- * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
+ * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle
*
- * If the current CPU is idle or running at a first-level (not nested)
+ * If the current CPU is idle and running at a first-level (not nested)
* interrupt from idle, return true. The caller must have at least
* disabled preemption.
*/
static int rcu_is_cpu_rrupt_from_idle(void)
{
- return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 &&
- __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1;
+ /* Called only from within the scheduling-clock interrupt */
+ lockdep_assert_in_irq();
+
+ /* Check for counter underflows */
+ RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
+ "RCU dynticks_nesting counter underflow!");
+ RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
+ "RCU dynticks_nmi_nesting counter underflow/zero!");
+
+ /* Are we at first interrupt nesting level? */
+ if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1)
+ return false;
+
+ /* Does CPU appear to be idle from an RCU standpoint? */
+ return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
}
-#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */
+#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */
+#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */
static long blimit = DEFAULT_RCU_BLIMIT;
#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */
static long qhimark = DEFAULT_RCU_QHIMARK;
@@ -2113,7 +2135,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
/* Reinstate batch limit if we have worked down the excess. */
count = rcu_segcblist_n_cbs(&rdp->cblist);
- if (rdp->blimit == LONG_MAX && count <= qlowmark)
+ if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark)
rdp->blimit = blimit;
/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
@@ -2253,7 +2275,7 @@ void rcu_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
/* Perform RCU core processing work for the current CPU. */
-static __latent_entropy void rcu_core(struct softirq_action *unused)
+static __latent_entropy void rcu_core(void)
{
unsigned long flags;
struct rcu_data *rdp = raw_cpu_ptr(&rcu_data);
@@ -2287,37 +2309,126 @@ static __latent_entropy void rcu_core(struct softirq_action *unused)
rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());
/* If there are callbacks ready, invoke them. */
- if (rcu_segcblist_ready_cbs(&rdp->cblist))
- invoke_rcu_callbacks(rdp);
+ if (rcu_segcblist_ready_cbs(&rdp->cblist) &&
+ likely(READ_ONCE(rcu_scheduler_fully_active)))
+ rcu_do_batch(rdp);
/* Do any needed deferred wakeups of rcuo kthreads. */
do_nocb_deferred_wakeup(rdp);
trace_rcu_utilization(TPS("End RCU core"));
}
+static void rcu_core_si(struct softirq_action *h)
+{
+ rcu_core();
+}
+
+static void rcu_wake_cond(struct task_struct *t, int status)
+{
+ /*
+ * If the thread is yielding, only wake it when this
+ * is invoked from idle
+ */
+ if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current)))
+ wake_up_process(t);
+}
+
+static void invoke_rcu_core_kthread(void)
+{
+ struct task_struct *t;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
+ t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task);
+ if (t != NULL && t != current)
+ rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
+ local_irq_restore(flags);
+}
+
/*
- * Schedule RCU callback invocation. If the running implementation of RCU
- * does not support RCU priority boosting, just do a direct call, otherwise
- * wake up the per-CPU kernel kthread. Note that because we are running
- * on the current CPU with softirqs disabled, the rcu_cpu_kthread_task
- * cannot disappear out from under us.
+ * Wake up this CPU's rcuc kthread to do RCU core processing.
*/
-static void invoke_rcu_callbacks(struct rcu_data *rdp)
+static void invoke_rcu_core(void)
{
- if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
- return;
- if (likely(!rcu_state.boost)) {
- rcu_do_batch(rdp);
+ if (!cpu_online(smp_processor_id()))
return;
+ if (use_softirq)
+ raise_softirq(RCU_SOFTIRQ);
+ else
+ invoke_rcu_core_kthread();
+}
+
+static void rcu_cpu_kthread_park(unsigned int cpu)
+{
+ per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
+}
+
+static int rcu_cpu_kthread_should_run(unsigned int cpu)
+{
+ return __this_cpu_read(rcu_data.rcu_cpu_has_work);
+}
+
+/*
+ * Per-CPU kernel thread that invokes RCU callbacks. This replaces
+ * the RCU softirq used in configurations of RCU that do not support RCU
+ * priority boosting.
+ */
+static void rcu_cpu_kthread(unsigned int cpu)
+{
+ unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
+ char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
+ int spincnt;
+
+ for (spincnt = 0; spincnt < 10; spincnt++) {
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
+ local_bh_disable();
+ *statusp = RCU_KTHREAD_RUNNING;
+ local_irq_disable();
+ work = *workp;
+ *workp = 0;
+ local_irq_enable();
+ if (work)
+ rcu_core();
+ local_bh_enable();
+ if (*workp == 0) {
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
+ *statusp = RCU_KTHREAD_WAITING;
+ return;
+ }
}
- invoke_rcu_callbacks_kthread();
+ *statusp = RCU_KTHREAD_YIELDING;
+ trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
+ schedule_timeout_interruptible(2);
+ trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
+ *statusp = RCU_KTHREAD_WAITING;
}
-static void invoke_rcu_core(void)
+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
+ .store = &rcu_data.rcu_cpu_kthread_task,
+ .thread_should_run = rcu_cpu_kthread_should_run,
+ .thread_fn = rcu_cpu_kthread,
+ .thread_comm = "rcuc/%u",
+ .setup = rcu_cpu_kthread_setup,
+ .park = rcu_cpu_kthread_park,
+};
+
+/*
+ * Spawn per-CPU RCU core processing kthreads.
+ */
+static int __init rcu_spawn_core_kthreads(void)
{
- if (cpu_online(smp_processor_id()))
- raise_softirq(RCU_SOFTIRQ);
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
+ if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
+ return 0;
+ WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
+ "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
+ return 0;
}
+early_initcall(rcu_spawn_core_kthreads);
/*
* Handle any core-RCU processing required by a call_rcu() invocation.
@@ -2354,7 +2465,7 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
rcu_accelerate_cbs_unlocked(rdp->mynode, rdp);
} else {
/* Give the grace period a kick. */
- rdp->blimit = LONG_MAX;
+ rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
rcu_force_quiescent_state();
@@ -3355,7 +3466,8 @@ void __init rcu_init(void)
rcu_init_one();
if (dump_tree)
rcu_dump_rcu_node_tree();
- open_softirq(RCU_SOFTIRQ, rcu_core);
+ if (use_softirq)
+ open_softirq(RCU_SOFTIRQ, rcu_core_si);
/*
* We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index e253d11af3c4..7acaf3a62d39 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -154,13 +154,15 @@ struct rcu_data {
bool core_needs_qs; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
bool gpwrap; /* Possible ->gp_seq wrap. */
- bool deferred_qs; /* This CPU awaiting a deferred QS? */
+ bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
unsigned long ticks_this_gp; /* The number of scheduling-clock */
/* ticks this CPU has handled */
/* during and after the last grace */
/* period it is aware of. */
+ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */
+ bool defer_qs_iw_pending; /* Scheduler attention pending? */
/* 2) batch handling */
struct rcu_segcblist cblist; /* Segmented callback list, with */
@@ -407,8 +409,8 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func);
static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
-static void invoke_rcu_callbacks_kthread(void);
static bool rcu_is_callbacks_kthread(void);
+static void rcu_cpu_kthread_setup(unsigned int cpu);
static void __init rcu_spawn_boost_kthreads(void);
static void rcu_prepare_kthreads(int cpu);
static void rcu_cleanup_after_idle(void);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 9c990df880d1..af7e7b9c86af 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -250,7 +250,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
*/
static void rcu_report_exp_rdp(struct rcu_data *rdp)
{
- WRITE_ONCE(rdp->deferred_qs, false);
+ WRITE_ONCE(rdp->exp_deferred_qs, false);
rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true);
}
@@ -259,8 +259,7 @@ static bool sync_exp_work_done(unsigned long s)
{
if (rcu_exp_gp_seq_done(s)) {
trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done"));
- /* Ensure test happens before caller kfree(). */
- smp_mb__before_atomic(); /* ^^^ */
+ smp_mb(); /* Ensure test happens before caller kfree(). */
return true;
}
return false;
@@ -384,7 +383,12 @@ retry_ipi:
mask_ofl_test |= mask;
continue;
}
+ if (get_cpu() == cpu) {
+ put_cpu();
+ continue;
+ }
ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
+ put_cpu();
if (!ret) {
mask_ofl_ipi &= ~mask;
continue;
@@ -611,7 +615,7 @@ static void rcu_exp_handler(void *unused)
rcu_dynticks_curr_cpu_in_eqs()) {
rcu_report_exp_rdp(rdp);
} else {
- rdp->deferred_qs = true;
+ rdp->exp_deferred_qs = true;
set_tsk_need_resched(t);
set_preempt_need_resched();
}
@@ -633,7 +637,7 @@ static void rcu_exp_handler(void *unused)
if (t->rcu_read_lock_nesting > 0) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->expmask & rdp->grpmask) {
- rdp->deferred_qs = true;
+ rdp->exp_deferred_qs = true;
t->rcu_read_unlock_special.b.exp_hint = true;
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -656,7 +660,7 @@ static void rcu_exp_handler(void *unused)
*
* Otherwise, force a context switch after the CPU enables everything.
*/
- rdp->deferred_qs = true;
+ rdp->exp_deferred_qs = true;
if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) {
rcu_preempt_deferred_qs(t);
@@ -694,6 +698,16 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
#else /* #ifdef CONFIG_PREEMPT_RCU */
+/* Request an expedited quiescent state. */
+static void rcu_exp_need_qs(void)
+{
+ __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
+ /* Store .exp before .rcu_urgent_qs. */
+ smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
+ set_tsk_need_resched(current);
+ set_preempt_need_resched();
+}
+
/* Invoked on each online non-idle CPU for expedited quiescent state. */
static void rcu_exp_handler(void *unused)
{
@@ -709,25 +723,38 @@ static void rcu_exp_handler(void *unused)
rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
return;
}
- __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
- /* Store .exp before .rcu_urgent_qs. */
- smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
- set_tsk_need_resched(current);
- set_preempt_need_resched();
+ rcu_exp_need_qs();
}
/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */
static void sync_sched_exp_online_cleanup(int cpu)
{
+ unsigned long flags;
+ int my_cpu;
struct rcu_data *rdp;
int ret;
struct rcu_node *rnp;
rdp = per_cpu_ptr(&rcu_data, cpu);
rnp = rdp->mynode;
- if (!(READ_ONCE(rnp->expmask) & rdp->grpmask))
+ my_cpu = get_cpu();
+ /* Quiescent state either not needed or already requested, leave. */
+ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
+ __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) {
+ put_cpu();
+ return;
+ }
+ /* Quiescent state needed on current CPU, so set it up locally. */
+ if (my_cpu == cpu) {
+ local_irq_save(flags);
+ rcu_exp_need_qs();
+ local_irq_restore(flags);
+ put_cpu();
return;
+ }
+ /* Quiescent state needed on some other CPU, send IPI. */
ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0);
+ put_cpu();
WARN_ON_ONCE(ret);
}
@@ -765,7 +792,6 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
*/
void synchronize_rcu_expedited(void)
{
- struct rcu_data *rdp;
struct rcu_exp_work rew;
struct rcu_node *rnp;
unsigned long s;
@@ -802,7 +828,6 @@ void synchronize_rcu_expedited(void)
}
/* Wait for expedited grace period to complete. */
- rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
rnp = rcu_get_root();
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
sync_exp_work_done(s));
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 1102765f91fd..acb225023ed1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -11,29 +11,7 @@
* Paul E. McKenney <paulmck@linux.ibm.com>
*/
-#include <linux/delay.h>
-#include <linux/gfp.h>
-#include <linux/oom.h>
-#include <linux/sched/debug.h>
-#include <linux/smpboot.h>
-#include <linux/sched/isolation.h>
-#include <uapi/linux/sched/types.h>
-#include "../time/tick-internal.h"
-
-#ifdef CONFIG_RCU_BOOST
#include "../locking/rtmutex_common.h"
-#else /* #ifdef CONFIG_RCU_BOOST */
-
-/*
- * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST,
- * all uses are in dead code. Provide a definition to keep the compiler
- * happy, but add WARN_ON_ONCE() to complain if used in the wrong place.
- * This probably needs to be excluded from -rt builds.
- */
-#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
-#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1)
-
-#endif /* #else #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@ -94,6 +72,8 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
if (gp_cleanup_delay)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
+ if (!use_softirq)
+ pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
pr_info("\tRCU debug extended QS entry/exit.\n");
rcupdate_announce_bootup_oddness();
@@ -257,10 +237,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
* no need to check for a subsequent expedited GP. (Though we are
* still in a quiescent state in any case.)
*/
- if (blkd_state & RCU_EXP_BLKD && rdp->deferred_qs)
+ if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs)
rcu_report_exp_rdp(rdp);
else
- WARN_ON_ONCE(rdp->deferred_qs);
+ WARN_ON_ONCE(rdp->exp_deferred_qs);
}
/*
@@ -357,7 +337,7 @@ void rcu_note_context_switch(bool preempt)
* means that we continue to block the current grace period.
*/
rcu_qs();
- if (rdp->deferred_qs)
+ if (rdp->exp_deferred_qs)
rcu_report_exp_rdp(rdp);
trace_rcu_utilization(TPS("End context switch"));
barrier(); /* Avoid RCU read-side critical sections leaking up. */
@@ -471,14 +451,15 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
*/
special = t->rcu_read_unlock_special;
rdp = this_cpu_ptr(&rcu_data);
- if (!special.s && !rdp->deferred_qs) {
+ if (!special.s && !rdp->exp_deferred_qs) {
local_irq_restore(flags);
return;
}
+ t->rcu_read_unlock_special.b.deferred_qs = false;
if (special.b.need_qs) {
rcu_qs();
t->rcu_read_unlock_special.b.need_qs = false;
- if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) {
+ if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) {
local_irq_restore(flags);
return;
}
@@ -490,7 +471,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
* tasks are handled when removing the task from the
* blocked-tasks list below.
*/
- if (rdp->deferred_qs) {
+ if (rdp->exp_deferred_qs) {
rcu_report_exp_rdp(rdp);
if (!t->rcu_read_unlock_special.s) {
local_irq_restore(flags);
@@ -579,7 +560,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
*/
static bool rcu_preempt_need_deferred_qs(struct task_struct *t)
{
- return (__this_cpu_read(rcu_data.deferred_qs) ||
+ return (__this_cpu_read(rcu_data.exp_deferred_qs) ||
READ_ONCE(t->rcu_read_unlock_special.s)) &&
t->rcu_read_lock_nesting <= 0;
}
@@ -607,6 +588,17 @@ static void rcu_preempt_deferred_qs(struct task_struct *t)
}
/*
+ * Minimal handler to give the scheduler a chance to re-evaluate.
+ */
+static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
+{
+ struct rcu_data *rdp;
+
+ rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
+ rdp->defer_qs_iw_pending = false;
+}
+
+/*
* Handle special cases during rcu_read_unlock(), such as needing to
* notify RCU core processing or task having blocked during the RCU
* read-side critical section.
@@ -625,16 +617,41 @@ static void rcu_read_unlock_special(struct task_struct *t)
local_irq_save(flags);
irqs_were_disabled = irqs_disabled_flags(flags);
if (preempt_bh_were_disabled || irqs_were_disabled) {
- WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false);
- /* Need to defer quiescent state until everything is enabled. */
- if (irqs_were_disabled) {
- /* Enabling irqs does not reschedule, so... */
+ bool exp;
+ struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
+ struct rcu_node *rnp = rdp->mynode;
+
+ t->rcu_read_unlock_special.b.exp_hint = false;
+ exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) ||
+ (rdp->grpmask & rnp->expmask) ||
+ tick_nohz_full_cpu(rdp->cpu);
+ // Need to defer quiescent state until everything is enabled.
+ if ((exp || in_irq()) && irqs_were_disabled && use_softirq &&
+ (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) {
+ // Using softirq, safe to awaken, and we get
+ // no help from enabling irqs, unlike bh/preempt.
raise_softirq_irqoff(RCU_SOFTIRQ);
+ } else if (exp && irqs_were_disabled && !use_softirq &&
+ !t->rcu_read_unlock_special.b.deferred_qs) {
+ // Safe to awaken and we get no help from enabling
+ // irqs, unlike bh/preempt.
+ invoke_rcu_core();
} else {
- /* Enabling BH or preempt does reschedule, so... */
+ // Enabling BH or preempt does reschedule, so...
+ // Also if no expediting or NO_HZ_FULL, slow is OK.
set_tsk_need_resched(current);
set_preempt_need_resched();
+ if (IS_ENABLED(CONFIG_IRQ_WORK) &&
+ !rdp->defer_qs_iw_pending && exp) {
+ // Get scheduler to re-evaluate and call hooks.
+ // If !IRQ_WORK, FQS scan will eventually IPI.
+ init_irq_work(&rdp->defer_qs_iw,
+ rcu_preempt_deferred_qs_handler);
+ rdp->defer_qs_iw_pending = true;
+ irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
+ }
}
+ t->rcu_read_unlock_special.b.deferred_qs = true;
local_irq_restore(flags);
return;
}
@@ -760,7 +777,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
i = 0;
list_for_each(lhp, &rnp->blkd_tasks) {
pr_cont(" %p", lhp);
- if (++i >= 10)
+ if (++i >= ncheck)
break;
}
pr_cont("\n");
@@ -944,18 +961,21 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+/*
+ * If boosting, set rcuc kthreads to realtime priority.
+ */
+static void rcu_cpu_kthread_setup(unsigned int cpu)
+{
#ifdef CONFIG_RCU_BOOST
+ struct sched_param sp;
-static void rcu_wake_cond(struct task_struct *t, int status)
-{
- /*
- * If the thread is yielding, only wake it when this
- * is invoked from idle
- */
- if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
- wake_up_process(t);
+ sp.sched_priority = kthread_prio;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+#endif /* #ifdef CONFIG_RCU_BOOST */
}
+#ifdef CONFIG_RCU_BOOST
+
/*
* Carry out RCU priority boosting on the task indicated by ->exp_tasks
* or ->boost_tasks, advancing the pointer to the next task in the
@@ -1091,23 +1111,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
}
/*
- * Wake up the per-CPU kthread to invoke RCU callbacks.
- */
-static void invoke_rcu_callbacks_kthread(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __this_cpu_write(rcu_data.rcu_cpu_has_work, 1);
- if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL &&
- current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) {
- rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task),
- __this_cpu_read(rcu_data.rcu_cpu_kthread_status));
- }
- local_irq_restore(flags);
-}
-
-/*
* Is the current CPU running the RCU-callbacks kthread?
* Caller must have preemption disabled.
*/
@@ -1160,59 +1163,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
return 0;
}
-static void rcu_cpu_kthread_setup(unsigned int cpu)
-{
- struct sched_param sp;
-
- sp.sched_priority = kthread_prio;
- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
-}
-
-static void rcu_cpu_kthread_park(unsigned int cpu)
-{
- per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
-}
-
-static int rcu_cpu_kthread_should_run(unsigned int cpu)
-{
- return __this_cpu_read(rcu_data.rcu_cpu_has_work);
-}
-
-/*
- * Per-CPU kernel thread that invokes RCU callbacks. This replaces
- * the RCU softirq used in configurations of RCU that do not support RCU
- * priority boosting.
- */
-static void rcu_cpu_kthread(unsigned int cpu)
-{
- unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
- char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
- int spincnt;
-
- for (spincnt = 0; spincnt < 10; spincnt++) {
- trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
- local_bh_disable();
- *statusp = RCU_KTHREAD_RUNNING;
- local_irq_disable();
- work = *workp;
- *workp = 0;
- local_irq_enable();
- if (work)
- rcu_do_batch(this_cpu_ptr(&rcu_data));
- local_bh_enable();
- if (*workp == 0) {
- trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
- *statusp = RCU_KTHREAD_WAITING;
- return;
- }
- }
- *statusp = RCU_KTHREAD_YIELDING;
- trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
- schedule_timeout_interruptible(2);
- trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
- *statusp = RCU_KTHREAD_WAITING;
-}
-
/*
* Set the per-rcu_node kthread's affinity to cover all CPUs that are
* served by the rcu_node in question. The CPU hotplug lock is still
@@ -1243,27 +1193,13 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
free_cpumask_var(cm);
}
-static struct smp_hotplug_thread rcu_cpu_thread_spec = {
- .store = &rcu_data.rcu_cpu_kthread_task,
- .thread_should_run = rcu_cpu_kthread_should_run,
- .thread_fn = rcu_cpu_kthread,
- .thread_comm = "rcuc/%u",
- .setup = rcu_cpu_kthread_setup,
- .park = rcu_cpu_kthread_park,
-};
-
/*
* Spawn boost kthreads -- called as soon as the scheduler is running.
*/
static void __init rcu_spawn_boost_kthreads(void)
{
struct rcu_node *rnp;
- int cpu;
- for_each_possible_cpu(cpu)
- per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
- if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__))
- return;
rcu_for_each_leaf_node(rnp)
(void)rcu_spawn_one_boost_kthread(rnp);
}
@@ -1286,11 +1222,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
-static void invoke_rcu_callbacks_kthread(void)
-{
- WARN_ON_ONCE(1);
-}
-
static bool rcu_is_callbacks_kthread(void)
{
return false;
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index f65a73a97323..065183391f75 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -630,7 +630,9 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp,
time_before(j, rcu_state.gp_req_activity + gpssdelay) ||
time_before(j, rcu_state.gp_activity + gpssdelay) ||
atomic_xchg(&warned, 1)) {
- raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */
+ if (rnp_root != rnp)
+ /* irqs remain disabled. */
+ raw_spin_unlock_rcu_node(rnp_root);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c3bf44ba42e5..61df2bf08563 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -423,6 +423,19 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
do { } while (0)
#endif
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST)
+/* Get rcutorture access to sched_setaffinity(). */
+long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+ int ret;
+
+ ret = sched_setaffinity(pid, in_mask);
+ WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity);
+#endif
+
#ifdef CONFIG_RCU_STALL_COMMON
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index b9e79e8c7226..c4d472b7f1b4 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/reboot.c
*
diff --git a/kernel/resource.c b/kernel/resource.c
index 8c15f846e8ef..158f04ec1d4f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/resource.c
*
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2d4ff5353ded..2067080bb235 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -259,7 +259,6 @@ out:
}
#endif /* CONFIG_PROC_FS */
-#ifdef CONFIG_SCHED_DEBUG
int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
if (!task_group_is_autogroup(tg))
@@ -267,4 +266,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
}
-#endif
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e3e3b979f9bd..1152259a4ca0 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* sched_clock() for unstable CPU clocks
*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 102dfcf0a29a..fa43ce3962e7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/sched/core.c
*
@@ -22,6 +23,17 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+/*
+ * Export tracepoints that act as a bare tracehook (ie: have no trace event
+ * associated with them) to allow external modules to probe them.
+ */
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
+
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
@@ -760,6 +772,401 @@ static void set_load_weight(struct task_struct *p, bool update_load)
}
}
+#ifdef CONFIG_UCLAMP_TASK
+/* Max allowed minimum utilization */
+unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
+
+/* Max allowed maximum utilization */
+unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
+
+/* All clamps are required to be less or equal than these values */
+static struct uclamp_se uclamp_default[UCLAMP_CNT];
+
+/* Integer rounded range for each bucket */
+#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
+
+#define for_each_clamp_id(clamp_id) \
+ for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
+
+static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
+{
+ return clamp_value / UCLAMP_BUCKET_DELTA;
+}
+
+static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
+{
+ return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
+}
+
+static inline unsigned int uclamp_none(int clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline void uclamp_se_set(struct uclamp_se *uc_se,
+ unsigned int value, bool user_defined)
+{
+ uc_se->value = value;
+ uc_se->bucket_id = uclamp_bucket_id(value);
+ uc_se->user_defined = user_defined;
+}
+
+static inline unsigned int
+uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
+ unsigned int clamp_value)
+{
+ /*
+ * Avoid blocked utilization pushing up the frequency when we go
+ * idle (which drops the max-clamp) by retaining the last known
+ * max-clamp.
+ */
+ if (clamp_id == UCLAMP_MAX) {
+ rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
+ return clamp_value;
+ }
+
+ return uclamp_none(UCLAMP_MIN);
+}
+
+static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
+ unsigned int clamp_value)
+{
+ /* Reset max-clamp retention only on idle exit */
+ if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+ return;
+
+ WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
+}
+
+static inline
+unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
+ unsigned int clamp_value)
+{
+ struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
+ int bucket_id = UCLAMP_BUCKETS - 1;
+
+ /*
+ * Since both min and max clamps are max aggregated, find the
+ * top most bucket with tasks in.
+ */
+ for ( ; bucket_id >= 0; bucket_id--) {
+ if (!bucket[bucket_id].tasks)
+ continue;
+ return bucket[bucket_id].value;
+ }
+
+ /* No tasks -- default clamp values */
+ return uclamp_idle_value(rq, clamp_id, clamp_value);
+}
+
+/*
+ * The effective clamp bucket index of a task depends on, by increasing
+ * priority:
+ * - the task specific clamp value, when explicitly requested from userspace
+ * - the system default clamp value, defined by the sysadmin
+ */
+static inline struct uclamp_se
+uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
+{
+ struct uclamp_se uc_req = p->uclamp_req[clamp_id];
+ struct uclamp_se uc_max = uclamp_default[clamp_id];
+
+ /* System default restrictions always apply */
+ if (unlikely(uc_req.value > uc_max.value))
+ return uc_max;
+
+ return uc_req;
+}
+
+unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
+{
+ struct uclamp_se uc_eff;
+
+ /* Task currently refcounted: use back-annotated (effective) value */
+ if (p->uclamp[clamp_id].active)
+ return p->uclamp[clamp_id].value;
+
+ uc_eff = uclamp_eff_get(p, clamp_id);
+
+ return uc_eff.value;
+}
+
+/*
+ * When a task is enqueued on a rq, the clamp bucket currently defined by the
+ * task's uclamp::bucket_id is refcounted on that rq. This also immediately
+ * updates the rq's clamp value if required.
+ *
+ * Tasks can have a task-specific value requested from user-space, track
+ * within each bucket the maximum value for tasks refcounted in it.
+ * This "local max aggregation" allows to track the exact "requested" value
+ * for each bucket when all its RUNNABLE tasks require the same clamp.
+ */
+static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
+ unsigned int clamp_id)
+{
+ struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
+ struct uclamp_se *uc_se = &p->uclamp[clamp_id];
+ struct uclamp_bucket *bucket;
+
+ lockdep_assert_held(&rq->lock);
+
+ /* Update task effective clamp */
+ p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
+
+ bucket = &uc_rq->bucket[uc_se->bucket_id];
+ bucket->tasks++;
+ uc_se->active = true;
+
+ uclamp_idle_reset(rq, clamp_id, uc_se->value);
+
+ /*
+ * Local max aggregation: rq buckets always track the max
+ * "requested" clamp value of its RUNNABLE tasks.
+ */
+ if (bucket->tasks == 1 || uc_se->value > bucket->value)
+ bucket->value = uc_se->value;
+
+ if (uc_se->value > READ_ONCE(uc_rq->value))
+ WRITE_ONCE(uc_rq->value, uc_se->value);
+}
+
+/*
+ * When a task is dequeued from a rq, the clamp bucket refcounted by the task
+ * is released. If this is the last task reference counting the rq's max
+ * active clamp value, then the rq's clamp value is updated.
+ *
+ * Both refcounted tasks and rq's cached clamp values are expected to be
+ * always valid. If it's detected they are not, as defensive programming,
+ * enforce the expected state and warn.
+ */
+static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
+ unsigned int clamp_id)
+{
+ struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
+ struct uclamp_se *uc_se = &p->uclamp[clamp_id];
+ struct uclamp_bucket *bucket;
+ unsigned int bkt_clamp;
+ unsigned int rq_clamp;
+
+ lockdep_assert_held(&rq->lock);
+
+ bucket = &uc_rq->bucket[uc_se->bucket_id];
+ SCHED_WARN_ON(!bucket->tasks);
+ if (likely(bucket->tasks))
+ bucket->tasks--;
+ uc_se->active = false;
+
+ /*
+ * Keep "local max aggregation" simple and accept to (possibly)
+ * overboost some RUNNABLE tasks in the same bucket.
+ * The rq clamp bucket value is reset to its base value whenever
+ * there are no more RUNNABLE tasks refcounting it.
+ */
+ if (likely(bucket->tasks))
+ return;
+
+ rq_clamp = READ_ONCE(uc_rq->value);
+ /*
+ * Defensive programming: this should never happen. If it happens,
+ * e.g. due to future modification, warn and fixup the expected value.
+ */
+ SCHED_WARN_ON(bucket->value > rq_clamp);
+ if (bucket->value >= rq_clamp) {
+ bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
+ WRITE_ONCE(uc_rq->value, bkt_clamp);
+ }
+}
+
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
+{
+ unsigned int clamp_id;
+
+ if (unlikely(!p->sched_class->uclamp_enabled))
+ return;
+
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_inc_id(rq, p, clamp_id);
+
+ /* Reset clamp idle holding when there is one RUNNABLE task */
+ if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+}
+
+static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
+{
+ unsigned int clamp_id;
+
+ if (unlikely(!p->sched_class->uclamp_enabled))
+ return;
+
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_dec_id(rq, p, clamp_id);
+}
+
+int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int old_min, old_max;
+ static DEFINE_MUTEX(mutex);
+ int result;
+
+ mutex_lock(&mutex);
+ old_min = sysctl_sched_uclamp_util_min;
+ old_max = sysctl_sched_uclamp_util_max;
+
+ result = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (result)
+ goto undo;
+ if (!write)
+ goto done;
+
+ if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
+ sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
+ result = -EINVAL;
+ goto undo;
+ }
+
+ if (old_min != sysctl_sched_uclamp_util_min) {
+ uclamp_se_set(&uclamp_default[UCLAMP_MIN],
+ sysctl_sched_uclamp_util_min, false);
+ }
+ if (old_max != sysctl_sched_uclamp_util_max) {
+ uclamp_se_set(&uclamp_default[UCLAMP_MAX],
+ sysctl_sched_uclamp_util_max, false);
+ }
+
+ /*
+ * Updating all the RUNNABLE task is expensive, keep it simple and do
+ * just a lazy update at each next enqueue time.
+ */
+ goto done;
+
+undo:
+ sysctl_sched_uclamp_util_min = old_min;
+ sysctl_sched_uclamp_util_max = old_max;
+done:
+ mutex_unlock(&mutex);
+
+ return result;
+}
+
+static int uclamp_validate(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
+ unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
+ lower_bound = attr->sched_util_min;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
+ upper_bound = attr->sched_util_max;
+
+ if (lower_bound > upper_bound)
+ return -EINVAL;
+ if (upper_bound > SCHED_CAPACITY_SCALE)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void __setscheduler_uclamp(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ unsigned int clamp_id;
+
+ /*
+ * On scheduling class change, reset to default clamps for tasks
+ * without a task-specific value.
+ */
+ for_each_clamp_id(clamp_id) {
+ struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
+ unsigned int clamp_value = uclamp_none(clamp_id);
+
+ /* Keep using defined clamps across class changes */
+ if (uc_se->user_defined)
+ continue;
+
+ /* By default, RT tasks always get 100% boost */
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
+ clamp_value = uclamp_none(UCLAMP_MAX);
+
+ uclamp_se_set(uc_se, clamp_value, false);
+ }
+
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
+ return;
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+ uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
+ attr->sched_util_min, true);
+ }
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+ uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
+ attr->sched_util_max, true);
+ }
+}
+
+static void uclamp_fork(struct task_struct *p)
+{
+ unsigned int clamp_id;
+
+ for_each_clamp_id(clamp_id)
+ p->uclamp[clamp_id].active = false;
+
+ if (likely(!p->sched_reset_on_fork))
+ return;
+
+ for_each_clamp_id(clamp_id) {
+ unsigned int clamp_value = uclamp_none(clamp_id);
+
+ /* By default, RT tasks always get 100% boost */
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
+ clamp_value = uclamp_none(UCLAMP_MAX);
+
+ uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false);
+ }
+}
+
+static void __init init_uclamp(void)
+{
+ struct uclamp_se uc_max = {};
+ unsigned int clamp_id;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
+ cpu_rq(cpu)->uclamp_flags = 0;
+ }
+
+ for_each_clamp_id(clamp_id) {
+ uclamp_se_set(&init_task.uclamp_req[clamp_id],
+ uclamp_none(clamp_id), false);
+ }
+
+ /* System defaults allow max clamp values for both indexes */
+ uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
+ for_each_clamp_id(clamp_id)
+ uclamp_default[clamp_id] = uc_max;
+}
+
+#else /* CONFIG_UCLAMP_TASK */
+static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
+static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
+static inline int uclamp_validate(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+static void __setscheduler_uclamp(struct task_struct *p,
+ const struct sched_attr *attr) { }
+static inline void uclamp_fork(struct task_struct *p) { }
+static inline void init_uclamp(void) { }
+#endif /* CONFIG_UCLAMP_TASK */
+
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
@@ -770,6 +1177,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
psi_enqueue(p, flags & ENQUEUE_WAKEUP);
}
+ uclamp_rq_inc(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
}
@@ -783,6 +1191,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
psi_dequeue(p, flags & DEQUEUE_SLEEP);
}
+ uclamp_rq_dec(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -929,7 +1338,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
*/
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
if (is_per_cpu_kthread(p))
@@ -1024,7 +1433,7 @@ static int migration_cpu_stop(void *data)
local_irq_disable();
/*
* We need to explicitly wake pending tasks before running
- * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
sched_ttwu_pending();
@@ -1055,7 +1464,7 @@ static int migration_cpu_stop(void *data)
*/
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
- cpumask_copy(&p->cpus_allowed, new_mask);
+ cpumask_copy(&p->cpus_mask, new_mask);
p->nr_cpus_allowed = cpumask_weight(new_mask);
}
@@ -1125,7 +1534,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
goto out;
}
- if (cpumask_equal(&p->cpus_allowed, new_mask))
+ if (cpumask_equal(p->cpus_ptr, new_mask))
goto out;
if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
@@ -1285,10 +1694,10 @@ static int migrate_swap_stop(void *data)
if (task_cpu(arg->src_task) != arg->src_cpu)
goto unlock;
- if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
goto unlock;
- if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
goto unlock;
__migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1330,10 +1739,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
goto out;
- if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
goto out;
- if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
+ if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
goto out;
trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1478,7 +1887,7 @@ void kick_process(struct task_struct *p)
EXPORT_SYMBOL_GPL(kick_process);
/*
- * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock
*
* A few notes on cpu_active vs cpu_online:
*
@@ -1518,14 +1927,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
for_each_cpu(dest_cpu, nodemask) {
if (!cpu_active(dest_cpu))
continue;
- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
return dest_cpu;
}
}
for (;;) {
/* Any allowed, online CPU? */
- for_each_cpu(dest_cpu, &p->cpus_allowed) {
+ for_each_cpu(dest_cpu, p->cpus_ptr) {
if (!is_cpu_allowed(p, dest_cpu))
continue;
@@ -1569,7 +1978,7 @@ out:
}
/*
- * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
*/
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
@@ -1579,11 +1988,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
if (p->nr_cpus_allowed > 1)
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
else
- cpu = cpumask_any(&p->cpus_allowed);
+ cpu = cpumask_any(p->cpus_ptr);
/*
* In order not to call set_task_cpu() on a blocking task we need
- * to rely on ttwu() to place the task on a valid ->cpus_allowed
+ * to rely on ttwu() to place the task on a valid ->cpus_ptr
* CPU.
*
* Since this is common to all placement strategies, this lives here.
@@ -1990,6 +2399,29 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
unsigned long flags;
int cpu, success = 0;
+ if (p == current) {
+ /*
+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
+ * == smp_processor_id()'. Together this means we can special
+ * case the whole 'p->on_rq && ttwu_remote()' case below
+ * without taking any locks.
+ *
+ * In particular:
+ * - we rely on Program-Order guarantees for all the ordering,
+ * - we're serialized against set_special_state() by virtue of
+ * it disabling IRQs (this allows not taking ->pi_lock).
+ */
+ if (!(p->state & state))
+ return false;
+
+ success = 1;
+ cpu = task_cpu(p);
+ trace_sched_waking(p);
+ p->state = TASK_RUNNING;
+ trace_sched_wakeup(p);
+ goto out;
+ }
+
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
@@ -1999,7 +2431,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
if (!(p->state & state))
- goto out;
+ goto unlock;
trace_sched_waking(p);
@@ -2029,7 +2461,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags))
- goto stat;
+ goto unlock;
#ifdef CONFIG_SMP
/*
@@ -2089,10 +2521,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
-stat:
- ttwu_stat(p, cpu, wake_flags);
-out:
+unlock:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+out:
+ if (success)
+ ttwu_stat(p, cpu, wake_flags);
return success;
}
@@ -2299,6 +2732,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
*/
p->prio = current->normal_prio;
+ uclamp_fork(p);
+
/*
* Revert to default priority/policy on fork if requested.
*/
@@ -2394,7 +2829,7 @@ void wake_up_new_task(struct task_struct *p)
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
- * - cpus_allowed can change in the fork path
+ * - cpus_ptr can change in the fork path
* - any previously selected CPU might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
@@ -3032,7 +3467,6 @@ void scheduler_tick(void)
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
- cpu_load_update_active(rq);
calc_global_load_tick(rq);
psi_task_tick(rq);
@@ -4070,6 +4504,13 @@ static void __setscheduler_params(struct task_struct *p,
static void __setscheduler(struct rq *rq, struct task_struct *p,
const struct sched_attr *attr, bool keep_boost)
{
+ /*
+ * If params can't change scheduling class changes aren't allowed
+ * either.
+ */
+ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ return;
+
__setscheduler_params(p, attr);
/*
@@ -4207,6 +4648,13 @@ recheck:
return retval;
}
+ /* Update task specific "requested" clamps */
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
+ retval = uclamp_validate(p, attr);
+ if (retval)
+ return retval;
+ }
+
/*
* Make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
@@ -4236,6 +4684,8 @@ recheck:
goto change;
if (dl_policy(policy) && dl_param_changed(p, attr))
goto change;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
+ goto change;
p->sched_reset_on_fork = reset_on_fork;
task_rq_unlock(rq, p, &rf);
@@ -4266,7 +4716,7 @@ change:
* the entire root_domain to become SCHED_DEADLINE. We
* will also fail if there's no bandwidth available.
*/
- if (!cpumask_subset(span, &p->cpus_allowed) ||
+ if (!cpumask_subset(span, p->cpus_ptr) ||
rq->rd->dl_bw.bw == 0) {
task_rq_unlock(rq, p, &rf);
return -EPERM;
@@ -4316,7 +4766,9 @@ change:
put_prev_task(rq, p);
prev_class = p->sched_class;
+
__setscheduler(rq, p, attr, pi);
+ __setscheduler_uclamp(p, attr);
if (queued) {
/*
@@ -4492,6 +4944,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
if (ret)
return -EFAULT;
+ if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
+ size < SCHED_ATTR_SIZE_VER1)
+ return -EINVAL;
+
/*
* XXX: Do we want to be lenient like existing syscalls; or do we want
* to be strict and return an error on out-of-bounds values?
@@ -4555,14 +5011,21 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
if ((int)attr.sched_policy < 0)
return -EINVAL;
+ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
+ attr.sched_policy = SETPARAM_POLICY;
rcu_read_lock();
retval = -ESRCH;
p = find_process_by_pid(pid);
- if (p != NULL)
- retval = sched_setattr(p, &attr);
+ if (likely(p))
+ get_task_struct(p);
rcu_read_unlock();
+ if (likely(p)) {
+ retval = sched_setattr(p, &attr);
+ put_task_struct(p);
+ }
+
return retval;
}
@@ -4713,6 +5176,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
else
attr.sched_nice = task_nice(p);
+#ifdef CONFIG_UCLAMP_TASK
+ attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+#endif
+
rcu_read_unlock();
retval = sched_read_attr(uattr, &attr, size);
@@ -4865,7 +5333,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
@@ -5122,7 +5590,7 @@ long __sched io_schedule_timeout(long timeout)
}
EXPORT_SYMBOL(io_schedule_timeout);
-void io_schedule(void)
+void __sched io_schedule(void)
{
int token;
@@ -5442,7 +5910,7 @@ int task_can_attach(struct task_struct *p,
* allowed nodes is unnecessary. Thus, cpusets are not
* applicable for such threads. This prevents checking for
* success of set_cpus_allowed_ptr() on all attached tasks
- * before cpus_allowed may be changed.
+ * before cpus_mask may be changed.
*/
if (p->flags & PF_NO_SETAFFINITY) {
ret = -EINVAL;
@@ -5469,7 +5937,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
if (curr_cpu == target_cpu)
return 0;
- if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
return -EINVAL;
/* TODO: This is not properly updating schedstats */
@@ -5607,7 +6075,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
put_prev_task(rq, next);
/*
- * Rules for changing task_struct::cpus_allowed are holding
+ * Rules for changing task_struct::cpus_mask are holding
* both pi_lock and rq->lock, such that holding either
* stabilizes the mask.
*
@@ -5901,8 +6369,8 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
void __init sched_init(void)
{
- int i, j;
unsigned long alloc_size = 0, ptr;
+ int i;
wait_bit_init();
@@ -6004,10 +6472,6 @@ void __init sched_init(void)
#ifdef CONFIG_RT_GROUP_SCHED
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
-
- for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
- rq->cpu_load[j] = 0;
-
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
@@ -6062,6 +6526,8 @@ void __init sched_init(void)
psi_init();
+ init_uclamp();
+
scheduler_running = 1;
}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 50316455ea66..5cc4012572ec 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/sched/cpudl.c
*
* Global CPU deadline management
*
* Author: Juri Lelli <j.lelli@sssup.it>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
#include "sched.h"
@@ -124,14 +120,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
- cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
+ cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
return 1;
} else {
int best_cpu = cpudl_maximum(cp);
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
- if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
+ if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
if (later_mask)
cpumask_set_cpu(best_cpu, later_mask);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 962cf343f798..636ca6f88c8e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -196,14 +196,17 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
* based on the task model parameters and gives the minimal utilization
* required to meet deadlines.
*/
-unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type)
+unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum schedutil_type type,
+ struct task_struct *p)
{
unsigned long dl_util, util, irq;
struct rq *rq = cpu_rq(cpu);
- if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt))
+ if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
+ type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
+ }
/*
* Early check to see if IRQ/steal time saturates the CPU, can be
@@ -219,9 +222,16 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
* CFS tasks and we use the same metric to track the effective
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
+ *
+ * CFS and RT utilization can be boosted or capped, depending on
+ * utilization clamp constraints requested by currently RUNNABLE
+ * tasks.
+ * When there are no CFS RUNNABLE tasks, clamps are released and
+ * frequency will be gracefully reduced with the utilization decay.
*/
- util = util_cfs;
- util += cpu_util_rt(rq);
+ util = util_cfs + cpu_util_rt(rq);
+ if (type == FREQUENCY_UTIL)
+ util = uclamp_util_with(rq, util, p);
dl_util = cpu_util_dl(rq);
@@ -276,12 +286,12 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
unsigned long util = cpu_util_cfs(rq);
- unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+ unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->max = max;
sg_cpu->bw_dl = cpu_bw_dl(rq);
- return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL);
+ return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
}
/**
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index daaadf939ccb..b7abca987d94 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/sched/cpupri.c
*
@@ -20,11 +21,6 @@
* searches). For tasks with affinity restrictions, the algorithm has a
* worst case complexity of O(min(102, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
#include "sched.h"
@@ -98,11 +94,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
if (skip)
continue;
- if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
+ if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
continue;
if (lowest_mask) {
- cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+ cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
/*
* We have to ensure that we have at least one bit
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ba4a143bdcf3..2305ce89a26c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Simple CPU accounting cgroup controller
*/
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 43901fa3f269..8b5bb2ac16e2 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -538,7 +538,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
* If we cannot preempt any rq, fall back to pick any
* online CPU:
*/
- cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
+ cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
if (cpu >= nr_cpu_ids) {
/*
* Failed to find any suitable CPU.
@@ -1195,7 +1195,7 @@ static void update_curr_dl(struct rq *rq)
&curr->dl);
} else {
unsigned long scale_freq = arch_scale_freq_capacity(cpu);
- unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+ unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
scaled_delta_exec = cap_scale(delta_exec, scale_freq);
scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
@@ -1824,7 +1824,7 @@ static void set_curr_task_dl(struct rq *rq)
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ cpumask_test_cpu(cpu, p->cpus_ptr))
return 1;
return 0;
}
@@ -1974,7 +1974,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
/* Retry if something changed. */
if (double_lock_balance(rq, later_rq)) {
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) ||
+ !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
task_running(rq, task) ||
!dl_task(task) ||
!task_on_rq_queued(task))) {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 678bfb9bd87f..f7e4579e746c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/sched/debug.c
*
* Print the CFS rbtree and other debugging details
*
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include "sched.h"
@@ -236,49 +233,35 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
*tablep = NULL;
}
-static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX-1;
-
static void
set_table_entry(struct ctl_table *entry,
const char *procname, void *data, int maxlen,
- umode_t mode, proc_handler *proc_handler,
- bool load_idx)
+ umode_t mode, proc_handler *proc_handler)
{
entry->procname = procname;
entry->data = data;
entry->maxlen = maxlen;
entry->mode = mode;
entry->proc_handler = proc_handler;
-
- if (load_idx) {
- entry->extra1 = &min_load_idx;
- entry->extra2 = &max_load_idx;
- }
}
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(14);
+ struct ctl_table *table = sd_alloc_ctl_entry(9);
if (table == NULL)
return NULL;
- set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
- set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
- set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
- set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
- set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
- set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
- set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- /* &table[13] is terminator */
+ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
+ /* &table[8] is terminator */
return table;
}
@@ -656,8 +639,6 @@ do { \
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
P(nr_running);
- SEQ_printf(m, " .%-30s: %lu\n", "load",
- rq->load.weight);
P(nr_switches);
P(nr_load_updates);
P(nr_uninterruptible);
@@ -665,11 +646,6 @@ do { \
SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
PN(clock);
PN(clock_task);
- P(cpu_load[0]);
- P(cpu_load[1]);
- P(cpu_load[2]);
- P(cpu_load[3]);
- P(cpu_load[4]);
#undef P
#undef PN
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f35930f5e528..036be95a87e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -275,6 +275,19 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
+{
+ if (!path)
+ return;
+
+ if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
+ autogroup_path(cfs_rq->tg, path, len);
+ else if (cfs_rq && cfs_rq->tg->css.cgroup)
+ cgroup_path(cfs_rq->tg->css.cgroup, path, len);
+ else
+ strlcpy(path, "(null)", len);
+}
+
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
@@ -449,6 +462,12 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return NULL;
}
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
+{
+ if (path)
+ strlcpy(path, "(null)", len);
+}
+
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
return true;
@@ -764,7 +783,7 @@ void post_init_entity_util_avg(struct task_struct *p)
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
- long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+ long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
if (cap > 0) {
@@ -1466,9 +1485,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long weighted_cpuload(struct rq *rq);
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
+static unsigned long cpu_runnable_load(struct rq *rq);
/* Cached statistics for all CPUs within a node */
struct numa_stats {
@@ -1489,7 +1506,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
for_each_cpu(cpu, cpumask_of_node(nid)) {
struct rq *rq = cpu_rq(cpu);
- ns->load += weighted_cpuload(rq);
+ ns->load += cpu_runnable_load(rq);
ns->compute_capacity += capacity_of(cpu);
}
@@ -1621,7 +1638,7 @@ static void task_numa_compare(struct task_numa_env *env,
* be incurred if the tasks were swapped.
*/
/* Skip this swap candidate if cannot move to the source cpu */
- if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
goto unlock;
/*
@@ -1718,7 +1735,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
/* Skip this CPU if the source task cannot migrate */
- if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
continue;
env->dst_cpu = cpu;
@@ -2686,8 +2703,6 @@ static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
if (entity_is_task(se)) {
struct rq *rq = rq_of(cfs_rq);
@@ -2703,8 +2718,6 @@ static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
- if (!parent_entity(se))
- update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
if (entity_is_task(se)) {
account_numa_dequeue(rq_of(cfs_rq), task_of(se));
@@ -3334,6 +3347,9 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
update_tg_cfs_util(cfs_rq, se, gcfs_rq);
update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
+ trace_pelt_cfs_tp(cfs_rq);
+ trace_pelt_se_tp(se);
+
return 1;
}
@@ -3486,6 +3502,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
cfs_rq_util_change(cfs_rq, flags);
+
+ trace_pelt_cfs_tp(cfs_rq);
}
/**
@@ -3505,6 +3523,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
cfs_rq_util_change(cfs_rq, 0);
+
+ trace_pelt_cfs_tp(cfs_rq);
}
/*
@@ -4100,7 +4120,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
- if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
+ if (schedstat_enabled() &&
+ rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
schedstat_set(se->statistics.slice_max,
max((u64)schedstat_val(se->statistics.slice_max),
se->sum_exec_runtime - se->prev_sum_exec_runtime));
@@ -4734,6 +4755,11 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
if (runtime_refresh_within(cfs_b, min_left))
return;
+ /* don't push forwards an existing deferred unthrottle */
+ if (cfs_b->slack_started)
+ return;
+ cfs_b->slack_started = true;
+
hrtimer_start(&cfs_b->slack_timer,
ns_to_ktime(cfs_bandwidth_slack_period),
HRTIMER_MODE_REL);
@@ -4787,6 +4813,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
/* confirm we're still not at a refresh boundary */
raw_spin_lock_irqsave(&cfs_b->lock, flags);
+ cfs_b->slack_started = false;
if (cfs_b->distribute_running) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
return;
@@ -4950,6 +4977,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
cfs_b->distribute_running = 0;
+ cfs_b->slack_started = false;
}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -5153,8 +5181,10 @@ static inline bool cpu_overutilized(int cpu)
static inline void update_overutilized_status(struct rq *rq)
{
- if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
+ if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
+ trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
+ }
}
#else
static inline void update_overutilized_status(struct rq *rq) { }
@@ -5325,71 +5355,6 @@ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
#ifdef CONFIG_NO_HZ_COMMON
-/*
- * per rq 'load' arrray crap; XXX kill this.
- */
-
-/*
- * The exact cpuload calculated at every tick would be:
- *
- * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
- *
- * If a CPU misses updates for n ticks (as it was idle) and update gets
- * called on the n+1-th tick when CPU may be busy, then we have:
- *
- * load_n = (1 - 1/2^i)^n * load_0
- * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
- *
- * decay_load_missed() below does efficient calculation of
- *
- * load' = (1 - 1/2^i)^n * load
- *
- * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
- * This allows us to precompute the above in said factors, thereby allowing the
- * reduction of an arbitrary n in O(log_2 n) steps. (See also
- * fixed_power_int())
- *
- * The calculation is approximated on a 128 point scale.
- */
-#define DEGRADE_SHIFT 7
-
-static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
-static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
- { 0, 0, 0, 0, 0, 0, 0, 0 },
- { 64, 32, 8, 0, 0, 0, 0, 0 },
- { 96, 72, 40, 12, 1, 0, 0, 0 },
- { 112, 98, 75, 43, 15, 1, 0, 0 },
- { 120, 112, 98, 76, 45, 16, 2, 0 }
-};
-
-/*
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
- * would be when CPU is idle and so we just decay the old load without
- * adding any new load.
- */
-static unsigned long
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
-{
- int j = 0;
-
- if (!missed_updates)
- return load;
-
- if (missed_updates >= degrade_zero_ticks[idx])
- return 0;
-
- if (idx == 1)
- return load >> missed_updates;
-
- while (missed_updates) {
- if (missed_updates % 2)
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
-
- missed_updates >>= 1;
- j++;
- }
- return load;
-}
static struct {
cpumask_var_t idle_cpus_mask;
@@ -5401,234 +5366,11 @@ static struct {
#endif /* CONFIG_NO_HZ_COMMON */
-/**
- * __cpu_load_update - update the rq->cpu_load[] statistics
- * @this_rq: The rq to update statistics for
- * @this_load: The current load
- * @pending_updates: The number of missed updates
- *
- * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC).
- *
- * This function computes a decaying average:
- *
- * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
- *
- * Because of NOHZ it might not get called on every tick which gives need for
- * the @pending_updates argument.
- *
- * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
- * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
- * = A * (A * load[i]_n-2 + B) + B
- * = A * (A * (A * load[i]_n-3 + B) + B) + B
- * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
- * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
- * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
- * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
- *
- * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
- * any change in load would have resulted in the tick being turned back on.
- *
- * For regular NOHZ, this reduces to:
- *
- * load[i]_n = (1 - 1/2^i)^n * load[i]_0
- *
- * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
- * term.
- */
-static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates)
-{
- unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
- int i, scale;
-
- this_rq->nr_load_updates++;
-
- /* Update our load: */
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
- unsigned long old_load, new_load;
-
- /* scale is effectively 1 << i now, and >> i divides by scale */
-
- old_load = this_rq->cpu_load[i];
-#ifdef CONFIG_NO_HZ_COMMON
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
- if (tickless_load) {
- old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
- /*
- * old_load can never be a negative value because a
- * decayed tickless_load cannot be greater than the
- * original tickless_load.
- */
- old_load += tickless_load;
- }
-#endif
- new_load = this_load;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (new_load > old_load)
- new_load += scale - 1;
-
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
- }
-}
-
-/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(struct rq *rq)
+static unsigned long cpu_runnable_load(struct rq *rq)
{
return cfs_rq_runnable_load_avg(&rq->cfs);
}
-#ifdef CONFIG_NO_HZ_COMMON
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we need to avoid the delta approach from the regular tick when
- * possible since that would seriously skew the load calculation. This is why we
- * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
- * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
- * loop exit, nohz_idle_balance, nohz full exit...)
- *
- * This means we might still be one tick off for nohz periods.
- */
-
-static void cpu_load_update_nohz(struct rq *this_rq,
- unsigned long curr_jiffies,
- unsigned long load)
-{
- unsigned long pending_updates;
-
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
- if (pending_updates) {
- this_rq->last_load_update_tick = curr_jiffies;
- /*
- * In the regular NOHZ case, we were idle, this means load 0.
- * In the NOHZ_FULL case, we were non-idle, we should consider
- * its weighted load.
- */
- cpu_load_update(this_rq, load, pending_updates);
- }
-}
-
-/*
- * Called from nohz_idle_balance() to update the load ratings before doing the
- * idle balance.
- */
-static void cpu_load_update_idle(struct rq *this_rq)
-{
- /*
- * bail if there's load or we're actually up-to-date.
- */
- if (weighted_cpuload(this_rq))
- return;
-
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
-}
-
-/*
- * Record CPU load on nohz entry so we know the tickless load to account
- * on nohz exit. cpu_load[0] happens then to be updated more frequently
- * than other cpu_load[idx] but it should be fine as cpu_load readers
- * shouldn't rely into synchronized cpu_load[*] updates.
- */
-void cpu_load_update_nohz_start(void)
-{
- struct rq *this_rq = this_rq();
-
- /*
- * This is all lockless but should be fine. If weighted_cpuload changes
- * concurrently we'll exit nohz. And cpu_load write can race with
- * cpu_load_update_idle() but both updater would be writing the same.
- */
- this_rq->cpu_load[0] = weighted_cpuload(this_rq);
-}
-
-/*
- * Account the tickless load in the end of a nohz frame.
- */
-void cpu_load_update_nohz_stop(void)
-{
- unsigned long curr_jiffies = READ_ONCE(jiffies);
- struct rq *this_rq = this_rq();
- unsigned long load;
- struct rq_flags rf;
-
- if (curr_jiffies == this_rq->last_load_update_tick)
- return;
-
- load = weighted_cpuload(this_rq);
- rq_lock(this_rq, &rf);
- update_rq_clock(this_rq);
- cpu_load_update_nohz(this_rq, curr_jiffies, load);
- rq_unlock(this_rq, &rf);
-}
-#else /* !CONFIG_NO_HZ_COMMON */
-static inline void cpu_load_update_nohz(struct rq *this_rq,
- unsigned long curr_jiffies,
- unsigned long load) { }
-#endif /* CONFIG_NO_HZ_COMMON */
-
-static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
-{
-#ifdef CONFIG_NO_HZ_COMMON
- /* See the mess around cpu_load_update_nohz(). */
- this_rq->last_load_update_tick = READ_ONCE(jiffies);
-#endif
- cpu_load_update(this_rq, load, 1);
-}
-
-/*
- * Called from scheduler_tick()
- */
-void cpu_load_update_active(struct rq *this_rq)
-{
- unsigned long load = weighted_cpuload(this_rq);
-
- if (tick_nohz_tick_stopped())
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
- else
- cpu_load_update_periodic(this_rq, load);
-}
-
-/*
- * Return a low guess at the load of a migration-source CPU weighted
- * according to the scheduling class and "nice" value.
- *
- * We want to under-estimate the load of migration sources, to
- * balance conservatively.
- */
-static unsigned long source_load(int cpu, int type)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(rq);
-
- if (type == 0 || !sched_feat(LB_BIAS))
- return total;
-
- return min(rq->cpu_load[type-1], total);
-}
-
-/*
- * Return a high guess at the load of a migration-target CPU weighted
- * according to the scheduling class and "nice" value.
- */
-static unsigned long target_load(int cpu, int type)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(rq);
-
- if (type == 0 || !sched_feat(LB_BIAS))
- return total;
-
- return max(rq->cpu_load[type-1], total);
-}
-
static unsigned long capacity_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity;
@@ -5638,7 +5380,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = weighted_cpuload(rq);
+ unsigned long load_avg = cpu_runnable_load(rq);
if (nr_running)
return load_avg / nr_running;
@@ -5736,7 +5478,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
s64 this_eff_load, prev_eff_load;
unsigned long task_load;
- this_eff_load = target_load(this_cpu, sd->wake_idx);
+ this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
if (sync) {
unsigned long current_load = task_h_load(current);
@@ -5754,7 +5496,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
this_eff_load *= 100;
this_eff_load *= capacity_of(prev_cpu);
- prev_eff_load = source_load(prev_cpu, sd->wake_idx);
+ prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
prev_eff_load -= task_load;
if (sched_feat(WA_BIAS))
prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
@@ -5815,14 +5557,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
unsigned long this_runnable_load = ULONG_MAX;
unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
unsigned long most_spare = 0, this_spare = 0;
- int load_idx = sd->forkexec_idx;
int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
(sd->imbalance_pct-100) / 100;
- if (sd_flag & SD_BALANCE_WAKE)
- load_idx = sd->wake_idx;
-
do {
unsigned long load, avg_load, runnable_load;
unsigned long spare_cap, max_spare_cap;
@@ -5831,7 +5569,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_span(group),
- &p->cpus_allowed))
+ p->cpus_ptr))
continue;
local_group = cpumask_test_cpu(this_cpu,
@@ -5846,12 +5584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
max_spare_cap = 0;
for_each_cpu(i, sched_group_span(group)) {
- /* Bias balancing toward CPUs of our domain */
- if (local_group)
- load = source_load(i, load_idx);
- else
- load = target_load(i, load_idx);
-
+ load = cpu_runnable_load(cpu_rq(i));
runnable_load += load;
avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
@@ -5963,7 +5696,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
return cpumask_first(sched_group_span(group));
/* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
if (available_idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
@@ -5987,7 +5720,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
- load = weighted_cpuload(cpu_rq(i));
+ load = cpu_runnable_load(cpu_rq(i));
if (load < min_load) {
min_load = load;
least_loaded_cpu = i;
@@ -6003,7 +5736,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
{
int new_cpu = cpu;
- if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
return prev_cpu;
/*
@@ -6120,7 +5853,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
if (!test_idle_cores(target, false))
return -1;
- cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
for_each_cpu_wrap(core, cpus, target) {
bool idle = true;
@@ -6154,7 +5887,7 @@ static int select_idle_smt(struct task_struct *p, int target)
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
if (available_idle_cpu(cpu))
return cpu;
@@ -6189,6 +5922,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
u64 time, cost;
s64 delta;
int cpu, nr = INT_MAX;
+ int this = smp_processor_id();
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -6212,18 +5946,18 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
nr = 4;
}
- time = local_clock();
+ time = cpu_clock(this);
for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
if (!--nr)
return -1;
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
if (available_idle_cpu(cpu))
break;
}
- time = local_clock() - time;
+ time = cpu_clock(this) - time;
cost = this_sd->avg_scan_cost;
delta = (s64)(time - cost) / 8;
this_sd->avg_scan_cost += delta;
@@ -6254,7 +5988,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
available_idle_cpu(recent_used_cpu) &&
- cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
+ cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
/*
* Replace recent_used_cpu with prev as it is a potential
* candidate for the next wake:
@@ -6498,11 +6232,21 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
static long
compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
{
- long util, max_util, sum_util, energy = 0;
+ unsigned int max_util, util_cfs, cpu_util, cpu_cap;
+ unsigned long sum_util, energy = 0;
+ struct task_struct *tsk;
int cpu;
for (; pd; pd = pd->next) {
+ struct cpumask *pd_mask = perf_domain_span(pd);
+
+ /*
+ * The energy model mandates all the CPUs of a performance
+ * domain have the same capacity.
+ */
+ cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
max_util = sum_util = 0;
+
/*
* The capacity state of CPUs of the current rd can be driven by
* CPUs of another rd if they belong to the same performance
@@ -6513,11 +6257,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
* it will not appear in its pd list and will not be accounted
* by compute_energy().
*/
- for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) {
- util = cpu_util_next(cpu, p, dst_cpu);
- util = schedutil_energy_util(cpu, util);
- max_util = max(util, max_util);
- sum_util += util;
+ for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
+ util_cfs = cpu_util_next(cpu, p, dst_cpu);
+
+ /*
+ * Busy time computation: utilization clamping is not
+ * required since the ratio (sum_util / cpu_capacity)
+ * is already enough to scale the EM reported power
+ * consumption at the (eventually clamped) cpu_capacity.
+ */
+ sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ ENERGY_UTIL, NULL);
+
+ /*
+ * Performance domain frequency: utilization clamping
+ * must be considered since it affects the selection
+ * of the performance domain frequency.
+ * NOTE: in case RT tasks are running, by default the
+ * FREQUENCY_UTIL's utilization can be max OPP.
+ */
+ tsk = cpu == dst_cpu ? p : NULL;
+ cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
+ FREQUENCY_UTIL, tsk);
+ max_util = max(max_util, cpu_util);
}
energy += em_pd_energy(pd->em_pd, max_util, sum_util);
@@ -6600,7 +6362,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
int max_spare_cap_cpu = -1;
for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
/* Skip CPUs that will be overutilized. */
@@ -6689,7 +6451,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
}
want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed);
+ cpumask_test_cpu(cpu, p->cpus_ptr);
}
rcu_read_lock();
@@ -7445,14 +7207,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
- * 2) cannot be migrated to this CPU due to cpus_allowed, or
+ * 2) cannot be migrated to this CPU due to cpus_ptr, or
* 3) running (obviously), or
* 4) are cache-hot on their current CPU.
*/
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
- if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
+ if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
int cpu;
schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
@@ -7472,7 +7234,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
/* Prevent to re-select dst_cpu via env's CPUs: */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
env->flags |= LBF_DST_PINNED;
env->new_dst_cpu = cpu;
break;
@@ -7558,7 +7320,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
static const unsigned int sched_nr_migrate_break = 32;
/*
- * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * detach_tasks() -- tries to detach up to imbalance runnable load from
* busiest_rq, as part of a balancing operation within domain "sd".
*
* Returns number of detached tasks if successful and 0 otherwise.
@@ -7626,7 +7388,7 @@ static int detach_tasks(struct lb_env *env)
/*
* We only want to steal up to the prescribed amount of
- * weighted load.
+ * runnable load.
*/
if (env->imbalance <= 0)
break;
@@ -7695,6 +7457,7 @@ static void attach_tasks(struct lb_env *env)
rq_unlock(env->dst_rq, &rf);
}
+#ifdef CONFIG_NO_HZ_COMMON
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
{
if (cfs_rq->avg.load_avg)
@@ -7722,6 +7485,19 @@ static inline bool others_have_blocked(struct rq *rq)
return false;
}
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+{
+ rq->last_blocked_load_update_tick = jiffies;
+
+ if (!has_blocked)
+ rq->has_blocked_load = 0;
+}
+#else
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
+static inline bool others_have_blocked(struct rq *rq) { return false; }
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7787,11 +7563,7 @@ static void update_blocked_averages(int cpu)
if (others_have_blocked(rq))
done = false;
-#ifdef CONFIG_NO_HZ_COMMON
- rq->last_blocked_load_update_tick = jiffies;
- if (done)
- rq->has_blocked_load = 0;
-#endif
+ update_blocked_load_status(rq, !done);
rq_unlock_irqrestore(rq, &rf);
}
@@ -7857,11 +7629,7 @@ static inline void update_blocked_averages(int cpu)
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
update_irq_load_avg(rq, 0);
-#ifdef CONFIG_NO_HZ_COMMON
- rq->last_blocked_load_update_tick = jiffies;
- if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
- rq->has_blocked_load = 0;
-#endif
+ update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
rq_unlock_irqrestore(rq, &rf);
}
@@ -7879,7 +7647,6 @@ static unsigned long task_h_load(struct task_struct *p)
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
unsigned long group_capacity;
unsigned long group_util; /* Total utilization of the group */
@@ -7933,38 +7700,10 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
};
}
-/**
- * get_sd_load_idx - Obtain the load index for a given sched domain.
- * @sd: The sched_domain whose load_idx is to be obtained.
- * @idle: The idle status of the CPU for whose sd load_idx is obtained.
- *
- * Return: The load index.
- */
-static inline int get_sd_load_idx(struct sched_domain *sd,
- enum cpu_idle_type idle)
-{
- int load_idx;
-
- switch (idle) {
- case CPU_NOT_IDLE:
- load_idx = sd->busy_idx;
- break;
-
- case CPU_NEWLY_IDLE:
- load_idx = sd->newidle_idx;
- break;
- default:
- load_idx = sd->idle_idx;
- break;
- }
-
- return load_idx;
-}
-
static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long max = arch_scale_cpu_capacity(sd, cpu);
+ unsigned long max = arch_scale_cpu_capacity(cpu);
unsigned long used, free;
unsigned long irq;
@@ -7989,7 +7728,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
unsigned long capacity = scale_rt_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
- cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
+ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
if (!capacity)
capacity = 1;
@@ -8099,7 +7838,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
/*
* Group imbalance indicates (and tries to solve) the problem where balancing
- * groups is inadequate due to ->cpus_allowed constraints.
+ * groups is inadequate due to ->cpus_ptr constraints.
*
* Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
* cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
@@ -8249,9 +7988,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct sg_lb_stats *sgs,
int *sg_status)
{
- int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
- int load_idx = get_sd_load_idx(env->sd, env->idle);
- unsigned long load;
int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
@@ -8262,13 +7998,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
env->flags |= LBF_NOHZ_AGAIN;
- /* Bias balancing toward CPUs of our domain: */
- if (local_group)
- load = target_load(i, load_idx);
- else
- load = source_load(i, load_idx);
-
- sgs->group_load += load;
+ sgs->group_load += cpu_runnable_load(rq);
sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
@@ -8283,7 +8013,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
- sgs->sum_weighted_load += weighted_cpuload(rq);
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -8302,7 +8031,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
if (sgs->sum_nr_running)
- sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+ sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
@@ -8516,8 +8245,12 @@ next_group:
/* Update over-utilization (tipping point, U >= 0) indicator */
WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
+ trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
} else if (sg_status & SG_OVERUTILIZED) {
- WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
+ struct root_domain *rd = env->dst_rq->rd;
+
+ WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
+ trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
}
}
@@ -8723,7 +8456,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
*
- * Also calculates the amount of weighted load which should be moved
+ * Also calculates the amount of runnable load which should be moved
* to restore balance.
*
* @env: The load balancing environment.
@@ -8768,7 +8501,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
/*
* If the busiest group is imbalanced the below checks don't
* work because they assume all things are equal, which typically
- * isn't true due to cpus_allowed constraints and the like.
+ * isn't true due to cpus_ptr constraints and the like.
*/
if (busiest->group_type == group_imbalanced)
goto force_balance;
@@ -8842,7 +8575,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
int i;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
- unsigned long capacity, wl;
+ unsigned long capacity, load;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -8896,30 +8629,30 @@ static struct rq *find_busiest_queue(struct lb_env *env,
rq->nr_running == 1)
continue;
- wl = weighted_cpuload(rq);
+ load = cpu_runnable_load(rq);
/*
- * When comparing with imbalance, use weighted_cpuload()
+ * When comparing with imbalance, use cpu_runnable_load()
* which is not scaled with the CPU capacity.
*/
- if (rq->nr_running == 1 && wl > env->imbalance &&
+ if (rq->nr_running == 1 && load > env->imbalance &&
!check_cpu_capacity(rq, env->sd))
continue;
/*
* For the load comparisons with the other CPU's, consider
- * the weighted_cpuload() scaled with the CPU capacity, so
+ * the cpu_runnable_load() scaled with the CPU capacity, so
* that the load can be moved away from the CPU that is
* potentially running at a lower capacity.
*
- * Thus we're looking for max(wl_i / capacity_i), crosswise
+ * Thus we're looking for max(load_i / capacity_i), crosswise
* multiplication to rid ourselves of the division works out
- * to: wl_i * capacity_j > wl_j * capacity_i; where j is
+ * to: load_i * capacity_j > load_j * capacity_i; where j is
* our previous maximum.
*/
- if (wl * busiest_capacity > busiest_load * capacity) {
- busiest_load = wl;
+ if (load * busiest_capacity > busiest_load * capacity) {
+ busiest_load = load;
busiest_capacity = capacity;
busiest = rq;
}
@@ -9210,7 +8943,7 @@ more_balance:
* if the curr task on busiest CPU can't be
* moved to this_cpu:
*/
- if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
+ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
env.flags |= LBF_ALL_PINNED;
@@ -9879,7 +9612,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- cpu_load_update_idle(rq);
rq_unlock_irqrestore(rq, &rf);
if (flags & NOHZ_BALANCE_KICK)
@@ -10690,6 +10422,10 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_change_group = task_change_group_fair,
#endif
+
+#ifdef CONFIG_UCLAMP_TASK
+ .uclamp_enabled = 1,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG
@@ -10737,3 +10473,83 @@ __init void init_sched_fair_class(void)
#endif /* SMP */
}
+
+/*
+ * Helper functions to facilitate extracting info from tracepoints.
+ */
+
+const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
+{
+#ifdef CONFIG_SMP
+ return cfs_rq ? &cfs_rq->avg : NULL;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
+
+char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
+{
+ if (!cfs_rq) {
+ if (str)
+ strlcpy(str, "(null)", len);
+ else
+ return NULL;
+ }
+
+ cfs_rq_tg_path(cfs_rq, str, len);
+ return str;
+}
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
+
+int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
+
+const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+ return rq ? &rq->avg_rt : NULL;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
+
+const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+ return rq ? &rq->avg_dl : NULL;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
+
+const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
+{
+#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
+ return rq ? &rq->avg_irq : NULL;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
+
+int sched_trace_rq_cpu(struct rq *rq)
+{
+ return rq ? cpu_of(rq) : -1;
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
+
+const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
+{
+#ifdef CONFIG_SMP
+ return rd ? rd->span : NULL;
+#else
+ return NULL;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rd_span);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 858589b83377..2410db5e9a35 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,7 +39,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
-SCHED_FEAT(LB_BIAS, false)
/*
* Decrement CPU capacity based on time not spent running tasks
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f5516bae0c1b..80940939b733 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic entry points for the idle threads and
* implementation of the idle task scheduling class.
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 687302051a27..123ea07a3f3b 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Housekeeping management. Manage the targets for routine code that can run on
* any CPU: unbound workqueues, timers, kthreads and any offloadable work.
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 3cd8a3a795d2..aa8d75804108 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -1,17 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
*
* membarrier system call
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include "sched.h"
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index befce29bd882..a96db50d40e0 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -28,6 +28,8 @@
#include "sched.h"
#include "pelt.h"
+#include <trace/events/sched.h>
+
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
@@ -265,6 +267,7 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
+ trace_pelt_se_tp(se);
return 1;
}
@@ -278,6 +281,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
cfs_se_util_change(&se->avg);
+ trace_pelt_se_tp(se);
return 1;
}
@@ -292,6 +296,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1, 1);
+ trace_pelt_cfs_tp(cfs_rq);
return 1;
}
@@ -317,6 +322,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
running)) {
___update_load_avg(&rq->avg_rt, 1, 1);
+ trace_pelt_rt_tp(rq);
return 1;
}
@@ -340,6 +346,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
running)) {
___update_load_avg(&rq->avg_dl, 1, 1);
+ trace_pelt_dl_tp(rq);
return 1;
}
@@ -366,7 +373,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
* reflect the real amount of computation
*/
running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
- running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+ running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq)));
/*
* We know the time that has been used by interrupt since last update
@@ -388,8 +395,10 @@ int update_irq_load_avg(struct rq *rq, u64 running)
1,
1);
- if (ret)
+ if (ret) {
___update_load_avg(&rq->avg_irq, 1, 1);
+ trace_pelt_irq_tp(rq);
+ }
return ret;
}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7489d5f56960..afff644da065 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -79,7 +79,7 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
* Scale the elapsed time to reflect the real amount of
* computation
*/
- delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq)));
+ delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq)));
delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
rq->clock_pelt += delta;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1e6b909dca36..a532558a5176 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1614,7 +1614,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- cpumask_test_cpu(cpu, &p->cpus_allowed))
+ cpumask_test_cpu(cpu, p->cpus_ptr))
return 1;
return 0;
@@ -1751,7 +1751,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* Also make sure that it wasn't scheduled on its rq.
*/
if (unlikely(task_rq(task) != rq ||
- !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
+ !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
task_running(rq, task) ||
!rt_task(task) ||
!task_on_rq_queued(task))) {
@@ -2400,6 +2400,10 @@ const struct sched_class rt_sched_class = {
.switched_to = switched_to_rt,
.update_curr = update_curr_rt,
+
+#ifdef CONFIG_UCLAMP_TASK
+ .uclamp_enabled = 1,
+#endif
};
#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
index a26473674fb7..c529706bed11 100644
--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
-static const u32 runnable_avg_yN_inv[] = {
+static const u32 runnable_avg_yN_inv[] __maybe_unused = {
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b52ed1ada0be..802b1f3405f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -96,12 +96,6 @@ extern atomic_long_t calc_load_tasks;
extern void calc_global_load_tick(struct rq *this_rq);
extern long calc_load_fold_active(struct rq *this_rq, long adjust);
-#ifdef CONFIG_SMP
-extern void cpu_load_update_active(struct rq *this_rq);
-#else
-static inline void cpu_load_update_active(struct rq *this_rq) { }
-#endif
-
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
@@ -344,8 +338,10 @@ struct cfs_bandwidth {
u64 runtime_expires;
int expires_seq;
- short idle;
- short period_active;
+ u8 idle;
+ u8 period_active;
+ u8 distribute_running;
+ u8 slack_started;
struct hrtimer period_timer;
struct hrtimer slack_timer;
struct list_head throttled_cfs_rq;
@@ -354,8 +350,6 @@ struct cfs_bandwidth {
int nr_periods;
int nr_throttled;
u64 throttled_time;
-
- bool distribute_running;
#endif
};
@@ -797,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work);
#endif
#endif /* CONFIG_SMP */
+#ifdef CONFIG_UCLAMP_TASK
+/*
+ * struct uclamp_bucket - Utilization clamp bucket
+ * @value: utilization clamp value for tasks on this clamp bucket
+ * @tasks: number of RUNNABLE tasks on this clamp bucket
+ *
+ * Keep track of how many tasks are RUNNABLE for a given utilization
+ * clamp value.
+ */
+struct uclamp_bucket {
+ unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
+ unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
+};
+
+/*
+ * struct uclamp_rq - rq's utilization clamp
+ * @value: currently active clamp values for a rq
+ * @bucket: utilization clamp buckets affecting a rq
+ *
+ * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
+ * A clamp value is affecting a rq when there is at least one task RUNNABLE
+ * (or actually running) with that value.
+ *
+ * There are up to UCLAMP_CNT possible different clamp values, currently there
+ * are only two: minimum utilization and maximum utilization.
+ *
+ * All utilization clamping values are MAX aggregated, since:
+ * - for util_min: we want to run the CPU at least at the max of the minimum
+ * utilization required by its currently RUNNABLE tasks.
+ * - for util_max: we want to allow the CPU to run up to the max of the
+ * maximum utilization allowed by its currently RUNNABLE tasks.
+ *
+ * Since on each system we expect only a limited number of different
+ * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
+ * the metrics required to compute all the per-rq utilization clamp values.
+ */
+struct uclamp_rq {
+ unsigned int value;
+ struct uclamp_bucket bucket[UCLAMP_BUCKETS];
+};
+#endif /* CONFIG_UCLAMP_TASK */
+
/*
* This is the main, per-CPU runqueue data structure.
*
@@ -818,8 +854,6 @@ struct rq {
unsigned int nr_preferred_running;
unsigned int numa_migrate_on;
#endif
- #define CPU_LOAD_IDX_MAX 5
- unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
@@ -830,11 +864,16 @@ struct rq {
atomic_t nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */
- /* capture load from *all* tasks on this CPU: */
- struct load_weight load;
unsigned long nr_load_updates;
u64 nr_switches;
+#ifdef CONFIG_UCLAMP_TASK
+ /* Utilization clamp values based on CPU's RUNNABLE tasks */
+ struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;
+ unsigned int uclamp_flags;
+#define UCLAMP_FLAG_IDLE 0x01
+#endif
+
struct cfs_rq cfs;
struct rt_rq rt;
struct dl_rq dl;
@@ -1649,6 +1688,10 @@ extern const u32 sched_prio_to_wmult[40];
struct sched_class {
const struct sched_class *next;
+#ifdef CONFIG_UCLAMP_TASK
+ int uclamp_enabled;
+#endif
+
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
@@ -2222,6 +2265,48 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
+#ifdef CONFIG_UCLAMP_TASK
+unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id);
+
+static __always_inline
+unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
+ struct task_struct *p)
+{
+ unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
+ unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+
+ if (p) {
+ min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
+ max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
+ }
+
+ /*
+ * Since CPU's {min,max}_util clamps are MAX aggregated considering
+ * RUNNABLE tasks with _different_ clamps, we can end up with an
+ * inversion. Fix it now when the clamps are applied.
+ */
+ if (unlikely(min_util >= max_util))
+ return min_util;
+
+ return clamp(util, min_util, max_util);
+}
+
+static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
+{
+ return uclamp_util_with(rq, util, NULL);
+}
+#else /* CONFIG_UCLAMP_TASK */
+static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
+ struct task_struct *p)
+{
+ return util;
+}
+static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
+{
+ return util;
+}
+#endif /* CONFIG_UCLAMP_TASK */
+
#ifdef arch_scale_freq_capacity
# ifndef arch_scale_freq_invariant
# define arch_scale_freq_invariant() true
@@ -2237,7 +2322,6 @@ static inline unsigned long capacity_orig_of(int cpu)
}
#endif
-#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
/**
* enum schedutil_type - CPU utilization type
* @FREQUENCY_UTIL: Utilization used to select frequency
@@ -2253,15 +2337,11 @@ enum schedutil_type {
ENERGY_UTIL,
};
-unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
- unsigned long max, enum schedutil_type type);
-
-static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
-{
- unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
- return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL);
-}
+unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum schedutil_type type,
+ struct task_struct *p);
static inline unsigned long cpu_bw_dl(struct rq *rq)
{
@@ -2290,11 +2370,13 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
return READ_ONCE(rq->avg_rt.util_avg);
}
#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
-static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
+static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long max, enum schedutil_type type,
+ struct task_struct *p)
{
- return cfs;
+ return 0;
}
-#endif
+#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f53f89df837d..f751ce0b783e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1344,11 +1344,6 @@ sd_init(struct sched_domain_topology_level *tl,
.imbalance_pct = 125,
.cache_nice_tries = 0,
- .busy_idx = 0,
- .idle_idx = 0,
- .newidle_idx = 0,
- .wake_idx = 0,
- .forkexec_idx = 0,
.flags = 1*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
@@ -1400,13 +1395,10 @@ sd_init(struct sched_domain_topology_level *tl,
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
- sd->busy_idx = 2;
#ifdef CONFIG_NUMA
} else if (sd->flags & SD_NUMA) {
sd->cache_nice_tries = 2;
- sd->busy_idx = 3;
- sd->idle_idx = 2;
sd->flags &= ~SD_PREFER_SIBLING;
sd->flags |= SD_SERIALIZE;
@@ -1419,8 +1411,6 @@ sd_init(struct sched_domain_topology_level *tl,
#endif
} else {
sd->cache_nice_tries = 1;
- sd->busy_idx = 2;
- sd->idle_idx = 1;
}
/*
@@ -1884,10 +1874,10 @@ static struct sched_domain_topology_level
unsigned long cap;
/* Is there any asymmetry? */
- cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
+ cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
for_each_cpu(i, cpu_map) {
- if (arch_scale_cpu_capacity(NULL, i) != cap) {
+ if (arch_scale_cpu_capacity(i) != cap) {
asym = true;
break;
}
@@ -1902,7 +1892,7 @@ static struct sched_domain_topology_level
* to everyone.
*/
for_each_cpu(i, cpu_map) {
- unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
+ unsigned long max_capacity = arch_scale_cpu_capacity(i);
int tl_id = 0;
for_each_sd_topology(tl) {
@@ -1912,7 +1902,7 @@ static struct sched_domain_topology_level
for_each_cpu_and(j, tl->mask(i), cpu_map) {
unsigned long capacity;
- capacity = arch_scale_cpu_capacity(NULL, j);
+ capacity = arch_scale_cpu_capacity(j);
if (capacity <= max_capacity)
continue;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 6eb1f8efd221..c1e566a114ca 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic waiting primitives.
*
@@ -117,16 +118,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
bookmark.func = NULL;
INIT_LIST_HEAD(&bookmark.entry);
- spin_lock_irqsave(&wq_head->lock, flags);
- nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
- spin_unlock_irqrestore(&wq_head->lock, flags);
-
- while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+ do {
spin_lock_irqsave(&wq_head->lock, flags);
nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
wake_flags, key, &bookmark);
spin_unlock_irqrestore(&wq_head->lock, flags);
- }
+ } while (bookmark.flags & WQ_FLAG_BOOKMARK);
}
/**
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index c67c6d24adc2..45eba18a2898 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* The implementation of the wait_bit*() and related waiting APIs:
*/
diff --git a/kernel/signal.c b/kernel/signal.c
index a1eb44dc9ff5..35e97f4073c2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/signal.c
*
@@ -44,6 +45,7 @@
#include <linux/posix-timers.h>
#include <linux/livepatch.h>
#include <linux/cgroup.h>
+#include <linux/audit.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -53,7 +55,6 @@
#include <asm/unistd.h>
#include <asm/siginfo.h>
#include <asm/cacheflush.h>
-#include "audit.h" /* audit_signal_info() */
/*
* SLAB caches for signal bits.
@@ -2484,6 +2485,8 @@ relock:
if (signal_group_exit(signal)) {
ksig->info.si_signo = signr = SIGKILL;
sigdelset(&current->pending.signal, SIGKILL);
+ trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
+ &sighand->action[SIGKILL - 1]);
recalc_sigpending();
goto fatal;
}
@@ -2909,7 +2912,8 @@ EXPORT_SYMBOL(set_compat_user_sigmask);
* This is useful for syscalls such as ppoll, pselect, io_pgetevents and
* epoll_pwait where a new sigmask is passed in from userland for the syscalls.
*/
-void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
+void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved,
+ bool interrupted)
{
if (!usigmask)
@@ -2919,7 +2923,7 @@ void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved)
* Restoring sigmask here can lead to delivering signals that the above
* syscalls are intended to block because of the sigmask passed in.
*/
- if (signal_pending(current)) {
+ if (interrupted) {
current->saved_sigmask = *sigsaved;
set_restore_sigmask();
return;
@@ -3618,12 +3622,11 @@ static struct pid *pidfd_to_pid(const struct file *file)
}
/**
- * sys_pidfd_send_signal - send a signal to a process through a task file
- * descriptor
- * @pidfd: the file descriptor of the process
- * @sig: signal to be sent
- * @info: the signal info
- * @flags: future flags to be passed
+ * sys_pidfd_send_signal - Signal a process through a pidfd
+ * @pidfd: file descriptor of the process
+ * @sig: signal to send
+ * @info: signal info
+ * @flags: future flags
*
* The syscall currently only signals via PIDTYPE_PID which covers
* kill(<positive-pid>, <signal>. It does not signal threads or process
diff --git a/kernel/smp.c b/kernel/smp.c
index f4cf1b0bb3b8..616d4d114847 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic helpers for smp ipi calls
*
@@ -33,7 +34,7 @@ struct call_function_data {
cpumask_var_t cpumask_ipi;
};
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
+static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
@@ -486,13 +487,11 @@ EXPORT_SYMBOL(smp_call_function_many);
* You must not call this function with disabled interrupts or from a
* hardware interrupt handler or from a bottom half handler.
*/
-int smp_call_function(smp_call_func_t func, void *info, int wait)
+void smp_call_function(smp_call_func_t func, void *info, int wait)
{
preempt_disable();
smp_call_function_many(cpu_online_mask, func, info, wait);
preempt_enable();
-
- return 0;
}
EXPORT_SYMBOL(smp_call_function);
@@ -593,18 +592,16 @@ void __init smp_init(void)
* early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
* of local_irq_disable/enable().
*/
-int on_each_cpu(void (*func) (void *info), void *info, int wait)
+void on_each_cpu(void (*func) (void *info), void *info, int wait)
{
unsigned long flags;
- int ret = 0;
preempt_disable();
- ret = smp_call_function(func, info, wait);
+ smp_call_function(func, info, wait);
local_irq_save(flags);
func(info);
local_irq_restore(flags);
preempt_enable();
- return ret;
}
EXPORT_SYMBOL(on_each_cpu);
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index c230c2dd48e1..2efe1e206167 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Common SMP CPU bringup/teardown functions
*/
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2c3382378d94..0427a86743a4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -1,10 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/softirq.c
*
* Copyright (C) 1992 Linus Torvalds
*
- * Distribute under GPLv2.
- *
* Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
*/
@@ -650,7 +649,7 @@ static int takeover_tasklets(unsigned int cpu)
/* Find end, append list for that CPU. */
if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
*__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
- this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
+ __this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
per_cpu(tasklet_vec, cpu).head = NULL;
per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
}
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 27bafc1e271e..36139de0a3c4 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/stacktrace.c
*
@@ -206,7 +207,7 @@ int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store,
ret = arch_stack_walk_reliable(consume_entry, &c, tsk);
put_task_stack(tsk);
- return ret;
+ return ret ? ret : c.len;
}
#endif
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 7231fb5953fc..b4f83f7bdf86 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* kernel/stop_machine.c
*
@@ -5,8 +6,6 @@
* Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
* Copyright (C) 2010 SUSE Linux Products GmbH
* Copyright (C) 2010 Tejun Heo <tj@kernel.org>
- *
- * This file is released under the GPLv2 and any later version.
*/
#include <linux/completion.h>
#include <linux/cpu.h>
@@ -178,12 +177,18 @@ static void ack_state(struct multi_stop_data *msdata)
set_state(msdata, msdata->state + 1);
}
+void __weak stop_machine_yield(const struct cpumask *cpumask)
+{
+ cpu_relax();
+}
+
/* This is the cpu_stop function which stops the CPU. */
static int multi_cpu_stop(void *data)
{
struct multi_stop_data *msdata = data;
enum multi_stop_state curstate = MULTI_STOP_NONE;
int cpu = smp_processor_id(), err = 0;
+ const struct cpumask *cpumask;
unsigned long flags;
bool is_active;
@@ -193,15 +198,18 @@ static int multi_cpu_stop(void *data)
*/
local_save_flags(flags);
- if (!msdata->active_cpus)
- is_active = cpu == cpumask_first(cpu_online_mask);
- else
- is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
+ if (!msdata->active_cpus) {
+ cpumask = cpu_online_mask;
+ is_active = cpu == cpumask_first(cpumask);
+ } else {
+ cpumask = msdata->active_cpus;
+ is_active = cpumask_test_cpu(cpu, cpumask);
+ }
/* Simple state machine */
do {
/* Chill out and ensure we re-read multi_stop_state. */
- cpu_relax_yield();
+ stop_machine_yield(cpumask);
if (msdata->state != curstate) {
curstate = msdata->state;
switch (curstate) {
diff --git a/kernel/sys.c b/kernel/sys.c
index bdbfe8d37418..2969304c29fe 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1882,13 +1882,14 @@ exit_err:
}
/*
+ * Check arithmetic relations of passed addresses.
+ *
* WARNING: we don't require any capability here so be very careful
* in what is allowed for modification from userspace.
*/
-static int validate_prctl_map(struct prctl_mm_map *prctl_map)
+static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map)
{
unsigned long mmap_max_addr = TASK_SIZE;
- struct mm_struct *mm = current->mm;
int error = -EINVAL, i;
static const unsigned char offsets[] = {
@@ -1949,24 +1950,6 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)
prctl_map->start_data))
goto out;
- /*
- * Someone is trying to cheat the auxv vector.
- */
- if (prctl_map->auxv_size) {
- if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
- goto out;
- }
-
- /*
- * Finally, make sure the caller has the rights to
- * change /proc/pid/exe link: only local sys admin should
- * be allowed to.
- */
- if (prctl_map->exe_fd != (u32)-1) {
- if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
- goto out;
- }
-
error = 0;
out:
return error;
@@ -1993,11 +1976,18 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
return -EFAULT;
- error = validate_prctl_map(&prctl_map);
+ error = validate_prctl_map_addr(&prctl_map);
if (error)
return error;
if (prctl_map.auxv_size) {
+ /*
+ * Someone is trying to cheat the auxv vector.
+ */
+ if (!prctl_map.auxv ||
+ prctl_map.auxv_size > sizeof(mm->saved_auxv))
+ return -EINVAL;
+
memset(user_auxv, 0, sizeof(user_auxv));
if (copy_from_user(user_auxv,
(const void __user *)prctl_map.auxv,
@@ -2010,6 +2000,14 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
}
if (prctl_map.exe_fd != (u32)-1) {
+ /*
+ * Make sure the caller has the rights to
+ * change /proc/pid/exe link: only local sys admin should
+ * be allowed to.
+ */
+ if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EINVAL;
+
error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
if (error)
return error;
@@ -2097,7 +2095,11 @@ static int prctl_set_mm(int opt, unsigned long addr,
unsigned long arg4, unsigned long arg5)
{
struct mm_struct *mm = current->mm;
- struct prctl_mm_map prctl_map;
+ struct prctl_mm_map prctl_map = {
+ .auxv = NULL,
+ .auxv_size = 0,
+ .exe_fd = -1,
+ };
struct vm_area_struct *vma;
int error;
@@ -2125,9 +2127,15 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = -EINVAL;
- down_write(&mm->mmap_sem);
+ /*
+ * arg_lock protects concurent updates of arg boundaries, we need
+ * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr
+ * validation.
+ */
+ down_read(&mm->mmap_sem);
vma = find_vma(mm, addr);
+ spin_lock(&mm->arg_lock);
prctl_map.start_code = mm->start_code;
prctl_map.end_code = mm->end_code;
prctl_map.start_data = mm->start_data;
@@ -2139,9 +2147,6 @@ static int prctl_set_mm(int opt, unsigned long addr,
prctl_map.arg_end = mm->arg_end;
prctl_map.env_start = mm->env_start;
prctl_map.env_end = mm->env_end;
- prctl_map.auxv = NULL;
- prctl_map.auxv_size = 0;
- prctl_map.exe_fd = -1;
switch (opt) {
case PR_SET_MM_START_CODE:
@@ -2181,7 +2186,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
goto out;
}
- error = validate_prctl_map(&prctl_map);
+ error = validate_prctl_map_addr(&prctl_map);
if (error)
goto out;
@@ -2218,7 +2223,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = 0;
out:
- up_write(&mm->mmap_sem);
+ spin_unlock(&mm->arg_lock);
+ up_read(&mm->mmap_sem);
return error;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 943c89178e3d..1c1ad1e14f21 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* sysctl.c: General linux system control interface
*
@@ -229,11 +230,6 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
#endif
static int proc_dopipe_max_size(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
-#ifdef CONFIG_BPF_SYSCALL
-static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos);
-#endif
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses its own private copy */
@@ -456,6 +452,22 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_rr_handler,
},
+#ifdef CONFIG_UCLAMP_TASK
+ {
+ .procname = "sched_util_clamp_min",
+ .data = &sysctl_sched_uclamp_util_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+ {
+ .procname = "sched_util_clamp_max",
+ .data = &sysctl_sched_uclamp_util_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sysctl_sched_uclamp_handler,
+ },
+#endif
#ifdef CONFIG_SCHED_AUTOGROUP
{
.procname = "sched_autogroup_enabled",
@@ -1252,12 +1264,10 @@ static struct ctl_table kern_table[] = {
},
{
.procname = "bpf_stats_enabled",
- .data = &sysctl_bpf_stats_enabled,
- .maxlen = sizeof(sysctl_bpf_stats_enabled),
+ .data = &bpf_stats_enabled_key.key,
+ .maxlen = sizeof(bpf_stats_enabled_key),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax_bpf_stats,
- .extra1 = &zero,
- .extra2 = &one,
+ .proc_handler = proc_do_static_key,
},
#endif
#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU)
@@ -3373,26 +3383,35 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
#endif /* CONFIG_PROC_SYSCTL */
-#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL)
-static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+#if defined(CONFIG_SYSCTL)
+int proc_do_static_key(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
{
- int ret, bpf_stats = *(int *)table->data;
- struct ctl_table tmp = *table;
+ struct static_key *key = (struct static_key *)table->data;
+ static DEFINE_MUTEX(static_key_mutex);
+ int val, ret;
+ struct ctl_table tmp = {
+ .data = &val,
+ .maxlen = sizeof(val),
+ .mode = table->mode,
+ .extra1 = &zero,
+ .extra2 = &one,
+ };
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
- tmp.data = &bpf_stats;
+ mutex_lock(&static_key_mutex);
+ val = static_key_enabled(key);
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (write && !ret) {
- *(int *)table->data = bpf_stats;
- if (bpf_stats)
- static_branch_enable(&bpf_stats_enabled_key);
+ if (val)
+ static_key_enable(key);
else
- static_branch_disable(&bpf_stats_enabled_key);
+ static_key_disable(key);
}
+ mutex_unlock(&static_key_mutex);
return ret;
}
#endif
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 5f852b8f59f7..13a0f2e6ebc2 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -1,19 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* taskstats.c - Export per-task statistics to userland
*
* Copyright (C) Shailabh Nagar, IBM Corp. 2006
* (C) Balbir Singh, IBM Corp. 2006
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
*/
#include <linux/kernel.h>
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 7bca480151b0..76c997fdbc9d 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -1,17 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* test_kprobes.c - simple sanity test for *probes
*
* Copyright IBM Corp. 2008
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
*/
#define pr_fmt(fmt) "Kprobe smoke test: " fmt
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index e2c038d6c13c..fcc42353f125 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Timer subsystem related configuration options
#
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index f1e46f338a9c..1867044800bb 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -16,5 +16,6 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y)
endif
obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o
+obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 0519a8805aab..57518efc3810 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -233,7 +233,6 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
/**
* alarmtimer_suspend - Suspend time callback
* @dev: unused
- * @state: unused
*
* When we are going into suspend, we look through the bases
* to see which is the soonest timer to expire. We then
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 3bcc19ceb073..fff5f64981c6 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -105,12 +105,12 @@ static DEFINE_SPINLOCK(watchdog_lock);
static int watchdog_running;
static atomic_t watchdog_reset_pending;
-static void inline clocksource_watchdog_lock(unsigned long *flags)
+static inline void clocksource_watchdog_lock(unsigned long *flags)
{
spin_lock_irqsave(&watchdog_lock, *flags);
}
-static void inline clocksource_watchdog_unlock(unsigned long *flags)
+static inline void clocksource_watchdog_unlock(unsigned long *flags)
{
spin_unlock_irqrestore(&watchdog_lock, *flags);
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 41dfff23c1f9..5ee77f1a8a92 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -30,7 +30,6 @@
#include <linux/syscalls.h>
#include <linux/interrupt.h>
#include <linux/tick.h>
-#include <linux/seq_file.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
#include <linux/sched/signal.h>
@@ -1115,9 +1114,10 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
* @timer: hrtimer to stop
*
* Returns:
- * 0 when the timer was not active
- * 1 when the timer was active
- * -1 when the timer is currently executing the callback function and
+ *
+ * * 0 when the timer was not active
+ * * 1 when the timer was active
+ * * -1 when the timer is currently executing the callback function and
* cannot be stopped
*/
int hrtimer_try_to_cancel(struct hrtimer *timer)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8de4f789dc1b..65eb796610dc 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -43,6 +43,7 @@ static u64 tick_length_base;
#define MAX_TICKADJ 500LL /* usecs */
#define MAX_TICKADJ_SCALED \
(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
+#define MAX_TAI_OFFSET 100000
/*
* phase-lock loop variables
@@ -691,7 +692,8 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc,
time_constant = max(time_constant, 0l);
}
- if (txc->modes & ADJ_TAI && txc->constant >= 0)
+ if (txc->modes & ADJ_TAI &&
+ txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET)
*time_tai = txc->constant;
if (txc->modes & ADJ_OFFSET)
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 29176635991f..d7f2d91acdac 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -980,23 +980,16 @@ retry_delete:
*/
static void itimer_delete(struct k_itimer *timer)
{
- unsigned long flags;
-
retry_delete:
- spin_lock_irqsave(&timer->it_lock, flags);
+ spin_lock_irq(&timer->it_lock);
if (timer_delete_hook(timer) == TIMER_RETRY) {
- unlock_timer(timer, flags);
+ spin_unlock_irq(&timer->it_lock);
goto retry_delete;
}
list_del(&timer->list);
- /*
- * This keeps any tasks waiting on the spin lock from thinking
- * they got something (see the lock code above).
- */
- timer->it_signal = NULL;
- unlock_timer(timer, flags);
+ spin_unlock_irq(&timer->it_lock);
release_posix_timer(timer, IT_ID_SET);
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4ee1a3428ae..be9707f68024 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -782,7 +782,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
*/
if (!ts->tick_stopped) {
calc_load_nohz_start();
- cpu_load_update_nohz_start();
quiet_vmstat();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -829,7 +828,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
/* Update jiffies first */
tick_do_update_jiffies64(now);
- cpu_load_update_nohz_stop();
/*
* Clear the timer idle flag, so we avoid IPIs on remote queueing and
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 7f7d6914ddd5..5c54ca632d08 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -251,6 +251,10 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
if (tv) {
if (compat_get_timeval(&user_tv, tv))
return -EFAULT;
+
+ if (!timeval_valid(&user_tv))
+ return -EINVAL;
+
new_ts.tv_sec = user_tv.tv_sec;
new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC;
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 85f5912d8f70..d911c8470149 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -808,17 +808,18 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
struct timekeeper *tk = &tk_core.timekeeper;
unsigned int seq;
ktime_t base, *offset = offsets[offs];
+ u64 nsecs;
WARN_ON(timekeeping_suspended);
do {
seq = read_seqcount_begin(&tk_core.seq);
base = ktime_add(tk->tkr_mono.base, *offset);
+ nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
} while (read_seqcount_retry(&tk_core.seq, seq));
- return base;
-
+ return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 98ba50dcb1b2..acb326f5f50a 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -282,23 +282,6 @@ static inline void timer_list_header(struct seq_file *m, u64 now)
SEQ_printf(m, "\n");
}
-static int timer_list_show(struct seq_file *m, void *v)
-{
- struct timer_list_iter *iter = v;
-
- if (iter->cpu == -1 && !iter->second_pass)
- timer_list_header(m, iter->now);
- else if (!iter->second_pass)
- print_cpu(m, iter->cpu, iter->now);
-#ifdef CONFIG_GENERIC_CLOCKEVENTS
- else if (iter->cpu == -1 && iter->second_pass)
- timer_list_show_tickdevices_header(m);
- else
- print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
-#endif
- return 0;
-}
-
void sysrq_timer_list_show(void)
{
u64 now = ktime_to_ns(ktime_get());
@@ -317,6 +300,24 @@ void sysrq_timer_list_show(void)
return;
}
+#ifdef CONFIG_PROC_FS
+static int timer_list_show(struct seq_file *m, void *v)
+{
+ struct timer_list_iter *iter = v;
+
+ if (iter->cpu == -1 && !iter->second_pass)
+ timer_list_header(m, iter->now);
+ else if (!iter->second_pass)
+ print_cpu(m, iter->cpu, iter->now);
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ else if (iter->cpu == -1 && iter->second_pass)
+ timer_list_show_tickdevices_header(m);
+ else
+ print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
+#endif
+ return 0;
+}
+
static void *move_iter(struct timer_list_iter *iter, loff_t offset)
{
for (; offset; offset--) {
@@ -376,3 +377,4 @@ static int __init init_timer_list_procfs(void)
return 0;
}
__initcall(init_timer_list_procfs);
+#endif
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
new file mode 100644
index 000000000000..a80893180826
--- /dev/null
+++ b/kernel/time/vsyscall.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 ARM Ltd.
+ *
+ * Generic implementation of update_vsyscall and update_vsyscall_tz.
+ *
+ * Based on the x86 specific implementation.
+ */
+
+#include <linux/hrtimer.h>
+#include <linux/timekeeper_internal.h>
+#include <vdso/datapage.h>
+#include <vdso/helpers.h>
+#include <vdso/vsyscall.h>
+
+static inline void update_vdso_data(struct vdso_data *vdata,
+ struct timekeeper *tk)
+{
+ struct vdso_timestamp *vdso_ts;
+ u64 nsec;
+
+ vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
+ vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
+ vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
+ vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
+ vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
+ vdata[CS_RAW].mask = tk->tkr_raw.mask;
+ vdata[CS_RAW].mult = tk->tkr_raw.mult;
+ vdata[CS_RAW].shift = tk->tkr_raw.shift;
+
+ /* CLOCK_REALTIME */
+ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
+ vdso_ts->sec = tk->xtime_sec;
+ vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
+
+ /* CLOCK_MONOTONIC */
+ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
+ vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
+
+ nsec = tk->tkr_mono.xtime_nsec;
+ nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
+ while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
+ nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift);
+ vdso_ts->sec++;
+ }
+ vdso_ts->nsec = nsec;
+
+ /* CLOCK_MONOTONIC_RAW */
+ vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
+ vdso_ts->sec = tk->raw_sec;
+ vdso_ts->nsec = tk->tkr_raw.xtime_nsec;
+
+ /* CLOCK_BOOTTIME */
+ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
+ vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
+ nsec = tk->tkr_mono.xtime_nsec;
+ nsec += ((u64)(tk->wall_to_monotonic.tv_nsec +
+ ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift);
+ while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
+ nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift);
+ vdso_ts->sec++;
+ }
+ vdso_ts->nsec = nsec;
+
+ /* CLOCK_TAI */
+ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI];
+ vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset;
+ vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
+
+ /*
+ * Read without the seqlock held by clock_getres().
+ * Note: No need to have a second copy.
+ */
+ WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);
+}
+
+void update_vsyscall(struct timekeeper *tk)
+{
+ struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_timestamp *vdso_ts;
+ u64 nsec;
+
+ if (__arch_update_vdso_data()) {
+ /*
+ * Some architectures might want to skip the update of the
+ * data page.
+ */
+ return;
+ }
+
+ /* copy vsyscall data */
+ vdso_write_begin(vdata);
+
+ vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk);
+ vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk);
+
+ /* CLOCK_REALTIME_COARSE */
+ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
+ vdso_ts->sec = tk->xtime_sec;
+ vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+
+ /* CLOCK_MONOTONIC_COARSE */
+ vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
+ vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
+ nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec = nsec + tk->wall_to_monotonic.tv_nsec;
+ while (nsec >= NSEC_PER_SEC) {
+ nsec = nsec - NSEC_PER_SEC;
+ vdso_ts->sec++;
+ }
+ vdso_ts->nsec = nsec;
+
+ if (__arch_use_vsyscall(vdata))
+ update_vdso_data(vdata, tk);
+
+ __arch_update_vsyscall(vdata, tk);
+
+ vdso_write_end(vdata);
+
+ __arch_sync_vdso_data(vdata);
+}
+
+void update_vsyscall_tz(void)
+{
+ struct vdso_data *vdata = __arch_get_k_vdso_data();
+
+ if (__arch_use_vsyscall(vdata)) {
+ vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest;
+ vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime;
+ }
+
+ __arch_sync_vdso_data(vdata);
+}
diff --git a/kernel/torture.c b/kernel/torture.c
index 17b2be9bde12..a8d9bdfba7c3 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -570,6 +570,7 @@ static void torture_shutdown_cleanup(void)
static struct task_struct *stutter_task;
static int stutter_pause_test;
static int stutter;
+static int stutter_gap;
/*
* Block until the stutter interval ends. This must be called periodically
@@ -578,10 +579,12 @@ static int stutter;
bool stutter_wait(const char *title)
{
int spt;
+ bool ret = false;
cond_resched_tasks_rcu_qs();
spt = READ_ONCE(stutter_pause_test);
for (; spt; spt = READ_ONCE(stutter_pause_test)) {
+ ret = true;
if (spt == 1) {
schedule_timeout_interruptible(1);
} else if (spt == 2) {
@@ -592,7 +595,7 @@ bool stutter_wait(const char *title)
}
torture_shutdown_absorb(title);
}
- return !!spt;
+ return ret;
}
EXPORT_SYMBOL_GPL(stutter_wait);
@@ -602,17 +605,24 @@ EXPORT_SYMBOL_GPL(stutter_wait);
*/
static int torture_stutter(void *arg)
{
+ int wtime;
+
VERBOSE_TOROUT_STRING("torture_stutter task started");
do {
if (!torture_must_stop() && stutter > 1) {
- WRITE_ONCE(stutter_pause_test, 1);
- schedule_timeout_interruptible(stutter - 1);
+ wtime = stutter;
+ if (stutter > HZ + 1) {
+ WRITE_ONCE(stutter_pause_test, 1);
+ wtime = stutter - HZ - 1;
+ schedule_timeout_interruptible(wtime);
+ wtime = HZ + 1;
+ }
WRITE_ONCE(stutter_pause_test, 2);
- schedule_timeout_interruptible(1);
+ schedule_timeout_interruptible(wtime);
}
WRITE_ONCE(stutter_pause_test, 0);
if (!torture_must_stop())
- schedule_timeout_interruptible(stutter);
+ schedule_timeout_interruptible(stutter_gap);
torture_shutdown_absorb("torture_stutter");
} while (!torture_must_stop());
torture_kthread_stopping("torture_stutter");
@@ -622,9 +632,10 @@ static int torture_stutter(void *arg)
/*
* Initialize and kick off the torture_stutter kthread.
*/
-int torture_stutter_init(const int s)
+int torture_stutter_init(const int s, const int sgap)
{
stutter = s;
+ stutter_gap = sgap;
return torture_create_kthread(torture_stutter, NULL, stutter_task);
}
EXPORT_SYMBOL_GPL(torture_stutter_init);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5d965cef6c77..564e5fdb025f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Architectures that offer an FUNCTION_TRACER implementation should
# select HAVE_FUNCTION_TRACER:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b496ffdf5f36..1c9a4745e596 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -410,8 +410,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
.arg4_type = ARG_CONST_SIZE,
};
-static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
-
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
u64 flags, struct perf_sample_data *sd)
@@ -442,24 +440,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
return perf_event_output(event, sd, regs);
}
+/*
+ * Support executing tracepoints in normal, irq, and nmi context that each call
+ * bpf_perf_event_output
+ */
+struct bpf_trace_sample_data {
+ struct perf_sample_data sds[3];
+};
+
+static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
+static DEFINE_PER_CPU(int, bpf_trace_nest_level);
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags, void *, data, u64, size)
{
- struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
+ struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
+ int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
struct perf_raw_record raw = {
.frag = {
.size = size,
.data = data,
},
};
+ struct perf_sample_data *sd;
+ int err;
- if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
- return -EINVAL;
+ if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ sd = &sds->sds[nest_level - 1];
+
+ if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
+ err = -EINVAL;
+ goto out;
+ }
perf_sample_data_init(sd, 0, 0);
sd->raw = &raw;
- return __bpf_perf_event_output(regs, map, flags, sd);
+ err = __bpf_perf_event_output(regs, map, flags, sd);
+
+out:
+ this_cpu_dec(bpf_trace_nest_level);
+ return err;
}
static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -822,16 +846,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
/*
* bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
* to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
+ *
+ * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
+ * in normal, irq, and nmi context.
*/
-static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
+struct bpf_raw_tp_regs {
+ struct pt_regs regs[3];
+};
+static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
+static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
+static struct pt_regs *get_bpf_raw_tp_regs(void)
+{
+ struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
+
+ if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
+ this_cpu_dec(bpf_raw_tp_nest_level);
+ return ERR_PTR(-EBUSY);
+ }
+
+ return &tp_regs->regs[nest_level - 1];
+}
+
+static void put_bpf_raw_tp_regs(void)
+{
+ this_cpu_dec(bpf_raw_tp_nest_level);
+}
+
BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags, void *, data, u64, size)
{
- struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ struct pt_regs *regs = get_bpf_raw_tp_regs();
+ int ret;
+
+ if (IS_ERR(regs))
+ return PTR_ERR(regs);
perf_fetch_caller_regs(regs);
- return ____bpf_perf_event_output(regs, map, flags, data, size);
+ ret = ____bpf_perf_event_output(regs, map, flags, data, size);
+
+ put_bpf_raw_tp_regs();
+ return ret;
}
static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
@@ -848,12 +904,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags)
{
- struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ struct pt_regs *regs = get_bpf_raw_tp_regs();
+ int ret;
+
+ if (IS_ERR(regs))
+ return PTR_ERR(regs);
perf_fetch_caller_regs(regs);
/* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
- return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
- flags, 0, 0);
+ ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
+ flags, 0, 0);
+ put_bpf_raw_tp_regs();
+ return ret;
}
static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
@@ -868,11 +930,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
void *, buf, u32, size, u64, flags)
{
- struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+ struct pt_regs *regs = get_bpf_raw_tp_regs();
+ int ret;
+
+ if (IS_ERR(regs))
+ return PTR_ERR(regs);
perf_fetch_caller_regs(regs);
- return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
- (unsigned long) size, flags, 0);
+ ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+ (unsigned long) size, flags, 0);
+ put_bpf_raw_tp_regs();
+ return ret;
}
static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
@@ -1297,7 +1365,8 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
}
#ifdef CONFIG_MODULES
-int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module)
+static int bpf_event_notify(struct notifier_block *nb, unsigned long op,
+ void *module)
{
struct bpf_trace_module *btm, *tmp;
struct module *mod = module;
@@ -1336,7 +1405,7 @@ static struct notifier_block bpf_module_nb = {
.notifier_call = bpf_event_notify,
};
-int __init bpf_event_init(void)
+static int __init bpf_event_init(void)
{
register_module_notifier(&bpf_module_nb);
return 0;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a12aff849c04..576c41644e77 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2935,14 +2935,13 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
p = &pg->records[i];
p->flags = rec_flags;
-#ifndef CC_USING_NOP_MCOUNT
/*
* Do the initial record conversion from mcount jump
* to the NOP instructions.
*/
- if (!ftrace_code_disable(mod, p))
+ if (!__is_defined(CC_USING_NOP_MCOUNT) &&
+ !ftrace_code_disable(mod, p))
break;
-#endif
update_cnt++;
}
@@ -4221,10 +4220,13 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
struct ftrace_func_entry *entry;
struct ftrace_func_map *map;
struct hlist_head *hhd;
- int size = 1 << mapper->hash.size_bits;
- int i;
+ int size, i;
+
+ if (!mapper)
+ return;
if (free_func && mapper->hash.count) {
+ size = 1 << mapper->hash.size_bits;
for (i = 0; i < size; i++) {
hhd = &mapper->hash.buckets[i];
hlist_for_each_entry(entry, hhd, hlist) {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2c92b3d9ea30..c3aabb576fe5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6719,11 +6719,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
break;
}
#endif
- if (!tr->allocated_snapshot) {
+ if (tr->allocated_snapshot)
+ ret = resize_buffer_duplicate_size(&tr->max_buffer,
+ &tr->trace_buffer, iter->cpu_file);
+ else
ret = tracing_alloc_snapshot_instance(tr);
- if (ret < 0)
- break;
- }
+ if (ret < 0)
+ break;
local_irq_disable();
/* Now, we're going to swap */
if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
@@ -6923,7 +6925,7 @@ struct tracing_log_err {
static DEFINE_MUTEX(tracing_err_log_lock);
-struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
+static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
{
struct tracing_log_err *err;
@@ -7126,12 +7128,24 @@ static ssize_t tracing_err_log_write(struct file *file,
return count;
}
+static int tracing_err_log_release(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+
+ trace_array_put(tr);
+
+ if (file->f_mode & FMODE_READ)
+ seq_release(inode, file);
+
+ return 0;
+}
+
static const struct file_operations tracing_err_log_fops = {
.open = tracing_err_log_open,
.write = tracing_err_log_write,
.read = seq_read,
.llseek = seq_lseek,
- .release = tracing_release_generic_tr,
+ .release = tracing_err_log_release,
};
static int tracing_buffers_open(struct inode *inode, struct file *filp)
@@ -8192,7 +8206,7 @@ static const struct file_operations buffer_percent_fops = {
.llseek = default_llseek,
};
-struct dentry *trace_instance_dir;
+static struct dentry *trace_instance_dir;
static void
init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer);
@@ -8910,12 +8924,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
cnt++;
- /* reset all but tr, trace, and overruns */
- memset(&iter.seq, 0,
- sizeof(struct trace_iterator) -
- offsetof(struct trace_iterator, seq));
+ trace_iterator_reset(&iter);
iter.iter_flags |= TRACE_FILE_LAT_FMT;
- iter.pos = -1;
if (trace_find_next_entry_inc(&iter) != NULL) {
int ret;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1974ce818ddb..005f08629b8b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -15,7 +15,6 @@
#include <linux/trace_seq.h>
#include <linux/trace_events.h>
#include <linux/compiler.h>
-#include <linux/trace_seq.h>
#include <linux/glob.h>
#ifdef CONFIG_FTRACE_SYSCALLS
@@ -1967,4 +1966,22 @@ static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { }
extern struct trace_iterator *tracepoint_print_iter;
+/*
+ * Reset the state of the trace_iterator so that it can read consumed data.
+ * Normally, the trace_iterator is used for reading the data when it is not
+ * consumed, and must retain state.
+ */
+static __always_inline void trace_iterator_reset(struct trace_iterator *iter)
+{
+ const size_t offset = offsetof(struct trace_iterator, seq);
+
+ /*
+ * Keep gcc from complaining about overwriting more than just one
+ * member in the structure.
+ */
+ memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset);
+
+ iter->pos = -1;
+}
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d3e59312ef40..5079d1db3754 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -428,7 +428,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL);
if (!op_stack)
return ERR_PTR(-ENOMEM);
- prog_stack = kmalloc_array(nr_preds, sizeof(*prog_stack), GFP_KERNEL);
+ prog_stack = kcalloc(nr_preds, sizeof(*prog_stack), GFP_KERNEL);
if (!prog_stack) {
parse_error(pe, -ENOMEM, 0);
goto out_free;
@@ -579,7 +579,11 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,
out_free:
kfree(op_stack);
kfree(inverts);
- kfree(prog_stack);
+ if (prog_stack) {
+ for (i = 0; prog_stack[i].pred; i++)
+ kfree(prog_stack[i].pred);
+ kfree(prog_stack);
+ }
return ERR_PTR(ret);
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 7fca3457c705..ca6b0dff60c5 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -59,7 +59,7 @@
C(NO_CLOSING_PAREN, "No closing paren found"), \
C(SUBSYS_NOT_FOUND, "Missing subsystem"), \
C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \
- C(INVALID_REF_KEY, "Using variable references as keys not supported"), \
+ C(INVALID_REF_KEY, "Using variable references in keys not supported"), \
C(VAR_NOT_FOUND, "Couldn't find variable"), \
C(FIELD_NOT_FOUND, "Couldn't find field"),
@@ -1854,6 +1854,9 @@ static u64 hist_field_var_ref(struct hist_field *hist_field,
struct hist_elt_data *elt_data;
u64 var_val = 0;
+ if (WARN_ON_ONCE(!elt))
+ return var_val;
+
elt_data = elt->private_data;
var_val = elt_data->var_ref_vals[hist_field->var_ref_idx];
@@ -3582,14 +3585,20 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data)
struct track_data *track_data = tr->cond_snapshot->cond_data;
struct hist_elt_data *elt_data, *track_elt_data;
struct snapshot_context *context = cond_data;
+ struct action_data *action;
u64 track_val;
if (!track_data)
return false;
+ action = track_data->action_data;
+
track_val = get_track_val(track_data->hist_data, context->elt,
track_data->action_data);
+ if (!action->track_data.check_val(track_data->track_val, track_val))
+ return false;
+
track_data->track_val = track_val;
memcpy(track_data->key, context->key, track_data->key_len);
@@ -4503,7 +4512,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
goto out;
}
- if (hist_field->flags & HIST_FIELD_FL_VAR_REF) {
+ if (field_has_hist_vars(hist_field, 0)) {
hist_err(tr, HIST_ERR_INVALID_REF_KEY, errpos(field_str));
destroy_hist_field(hist_field, 0);
ret = -EINVAL;
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 1e6db9cbe4dc..fa95139445b2 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -277,7 +277,7 @@ static void move_to_next_cpu(void)
* of this thread, than stop migrating for the duration
* of the current test.
*/
- if (!cpumask_equal(current_mask, &current->cpus_allowed))
+ if (!cpumask_equal(current_mask, current->cpus_ptr))
goto disable;
get_online_cpus();
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 6c1ae6b752d1..cca65044c14c 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -37,12 +37,8 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
if (skip_entries)
kdb_printf("(skipping %d entries)\n", skip_entries);
- /* reset all but tr, trace, and overruns */
- memset(&iter.seq, 0,
- sizeof(struct trace_iterator) -
- offsetof(struct trace_iterator, seq));
+ trace_iterator_reset(&iter);
iter.iter_flags |= TRACE_FILE_LAT_FMT;
- iter.pos = -1;
if (cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 54373d93e251..ba751f993c3b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1057,7 +1057,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
trace_seq_puts(s, "<stack trace>\n");
- for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) {
+ for (p = field->caller; p && p < end && *p != ULONG_MAX; p++) {
if (trace_seq_has_overflowed(s))
break;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index eb7e06b54741..b55906c77ce0 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -426,8 +426,6 @@ end:
/*
* Argument syntax:
* - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS]
- *
- * - Remove uprobe: -:[GRP/]EVENT
*/
static int trace_uprobe_create(int argc, const char **argv)
{
@@ -443,10 +441,17 @@ static int trace_uprobe_create(int argc, const char **argv)
ret = 0;
ref_ctr_offset = 0;
- /* argc must be >= 1 */
- if (argv[0][0] == 'r')
+ switch (argv[0][0]) {
+ case 'r':
is_return = true;
- else if (argv[0][0] != 'p' || argc < 2)
+ break;
+ case 'p':
+ break;
+ default:
+ return -ECANCELED;
+ }
+
+ if (argc < 2)
return -ECANCELED;
if (argv[0][1] == ':')
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 46f2ab1e08a9..df3ade14ccbd 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2008-2014 Mathieu Desnoyers
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <linux/module.h>
#include <linux/mutex.h>
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 370724b45391..7be3e7530841 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -1,19 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* tsacct.c - System accounting over taskstats interface
*
* Copyright (C) Jay Lan, <jlan@sgi.com>
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
*/
#include <linux/kernel.h>
diff --git a/kernel/ucount.c b/kernel/ucount.c
index f48d1b6376a4..feb128c7b5d9 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -1,9 +1,4 @@
-/*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2 of the
- * License.
- */
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/stat.h>
#include <linux/sysctl.h>
diff --git a/kernel/umh.c b/kernel/umh.c
index d937cbad903a..7f255b5a8845 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* umh - the kernel usermode helper
*/
diff --git a/kernel/up.c b/kernel/up.c
index ff536f9cc8a2..862b460ab97a 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Uniprocessor-only support functions. The counterpart to kernel/smp.c
*/
@@ -34,14 +35,13 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd)
}
EXPORT_SYMBOL(smp_call_function_single_async);
-int on_each_cpu(smp_call_func_t func, void *info, int wait)
+void on_each_cpu(smp_call_func_t func, void *info, int wait)
{
unsigned long flags;
local_irq_save(flags);
func(info);
local_irq_restore(flags);
- return 0;
}
EXPORT_SYMBOL(on_each_cpu);
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 9586b670a5b2..870ecd7c63ed 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/user-return-notifier.h>
#include <linux/percpu.h>
diff --git a/kernel/user.c b/kernel/user.c
index 88b834f0eebc..78b17e36e705 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* The "user cache".
*
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 923414a246e9..0eff45ce7703 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -1,9 +1,4 @@
-/*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2 of the
- * License.
- */
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/export.h>
#include <linux/nsproxy.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index dcd6be1996fe..f0e491193009 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2004 IBM Corporation
*
* Author: Serge Hallyn <serue@us.ibm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2 of the
- * License.
*/
#include <linux/export.h>
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 258033d62cb3..3732c888a949 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2007
*
* Author: Eric Biederman <ebiederm@xmision.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2 of the
- * License.
*/
#include <linux/export.h>
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9657315405de..95aea04ff722 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/workqueue.c - generic async execution with shared worker pool
*